def _dump_logs(self) -> None:
    """
    Write log.
    """
    time_elapsed = time.time() - self.start_time
    fps = int(self.num_timesteps / (time_elapsed + 1e-8))
    self.logger.record("time/episodes", self._episode_num, exclude="tensorboard")
    if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
        self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
        self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
    self.logger.record("time/fps", fps)
    self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
    self.logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
    if self.use_sde:
        self.logger.record("train/std", (self.actor.get_std()).mean().item())
    if len(self.ep_success_buffer) > 0:
        self.logger.record("rollout/success rate", safe_mean(self.ep_success_buffer))
    # Pass the number of timesteps for tensorboard
    self.logger.dump(step=self.num_timesteps)
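# All of these snippets lean on a `safe_mean` helper from stable_baselines3.common.utils.
# A minimal sketch of an equivalent helper (assuming the usual convention of returning
# NaN for an empty episode-info buffer so the logger simply records "nan" instead of warning):
import numpy as np

def safe_mean(arr):
    """Mean of a possibly empty sequence; NaN signals "no completed episodes yet"."""
    return np.nan if len(arr) == 0 else float(np.mean(arr))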
def step_wait(self):
    if self.needs_reset:
        raise RuntimeError('Tried to step vectorized environment that needs reset!')
    obss, rews, dones, infos = self.venv.step_wait()
    self.curr_ep_rewards += rews
    self.curr_ep_lengths += 1
    new_infos = list(infos[:])
    for key in self.curr_ep_data:
        self.curr_ep_data[key] += [info[key] for info in infos]  # [dk for dk in map(lambda d: d[key], infos)]
    for i in range(len(dones)):
        if dones[i]:
            info = infos[i].copy()
            ep_rew = self.curr_ep_rewards[i]
            ep_len = self.curr_ep_lengths[i]
            ep_time = round(time.time() - self.t_start, 6)
            ep_info = {'r': ep_rew, 'l': ep_len, 't': ep_time}
            for key in self.curr_ep_data:
                # Change in behavior: grab only the values of the episode that would be overwritten
                ep_info[key] = self.curr_ep_data[key][i]
                self.curr_ep_data[key][i] = 0
            self.episode_rewards.append(ep_rew)
            self.episode_lengths.append(ep_len)
            self.episode_times.append(ep_time)
            self.curr_ep_rewards[i] = 0
            self.curr_ep_lengths[i] = 0
            if self.logger:
                for key in self.curr_rollout_data:
                    self.curr_rollout_data[key].append(ep_info[key])
            info['episode'] = ep_info
            new_infos[i] = info
    self.total_steps += self.num_envs
    self.step_idx_in_rollout += 1
    if self.step_idx_in_rollout == self.rollout_size:
        if self.logger:
            # Correct the value for time (a bit ugly, I know)
            if 't' in self.curr_rollout_data:
                self.curr_rollout_data['t'] = [time.time() - self.t_start]
            # Store the average values per rollout
            self.logger.writerow({k: safe_mean(self.curr_rollout_data[k]) for k in self.curr_rollout_data})
            self.file_handler.flush()
            for key in self.info_keywords:
                logger.record(key, safe_mean(self.curr_rollout_data[key]))
        for key in self.curr_rollout_data:
            self.curr_rollout_data[key] = []
        self.step_idx_in_rollout = 0
    return obss, rews, dones, new_infos
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
    )
    callback.on_training_start(locals(), globals())

    # debug ===============================================================
    if mode == 'debug':
        print(['OPA.learn started, ready to loop (OPA.collect_rollouts + OPA.train)'])

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.learn', 'num_timesteps:', self.num_timesteps, 'total_timesteps:', total_timesteps])

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # debug ===============================================================
        if mode == 'debug':
            print(['OPA.learn finished, ready to OPA.train'])

        self.train()

    callback.on_training_end()
    return self
def learn(self,
          total_timesteps,
          callback=None,
          log_interval=1,
          tb_log_name="run",
          eval_env=None,
          eval_freq=-1,
          n_eval_episodes=5,
          eval_log_path=None,
          reset_num_timesteps=True,
          ):
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
    )
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()
    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
    )
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            self.fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                # logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                # logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_reward_mean", safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
            if len(self.specific_reward_info_buffer) > 0 and len(self.specific_reward_info_buffer[0]) > 0:
                logger.record('rollout/mimic_qpos_reward',
                              safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record('rollout/mimic_qvel_reward',
                              safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                # logger.record('rollout/mimic_ee_reward',
                #               safe_mean([specific_reward_info['mimic_ee_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record('rollout/mimic_body_orientation_reward',
                              safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record('rollout/mimic_body_reward',
                              safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record('rollout/mimic_body_vel_reward',
                              safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record('rollout/mimic_contact_reward',
                              safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
            logger.record("time/fps", self.fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()
    return self
def _dump_logs(self) -> None:
    """
    Write log.
    """
    try:
        fps = int(self.num_timesteps / (time.time() - self.start_time))
    except ZeroDivisionError:
        warnings.warn("fps dump had zero division somehow, storing 0 instead.")
        fps = 0
    logger.record("time/episodes", self._episode_num, exclude="tensorboard")
    if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
        logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
        logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
    logger.record("time/fps", fps)
    logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
    logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
    if self.use_sde:
        logger.record("train/std", (self.actor.get_std()).mean().item())
    if len(self.ep_success_buffer) > 0:
        logger.record("rollout/success rate", safe_mean(self.ep_success_buffer))
    # Pass the number of timesteps for tensorboard
    logger.dump(step=self.num_timesteps)
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                for k in self.ep_info_buffer[0].keys():
                    if k not in "lrt":
                        logger.record(f"progress/{k}", safe_mean([ep_info[k] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        if iteration % (log_interval * 10) == 0:
            # save parameters every 10 log steps
            self.save('./interim_trained_models/')

        self.train()

    callback.on_training_end()
    return self
def train(args):
    cuda_availability = torch.cuda.is_available()
    print('\n*************************')
    print('`CUDA` available: {}'.format(cuda_availability))
    print('Device specified: {}'.format(args.device))
    print('*************************\n')

    # load the config of the trained model:
    with open(args.pretrained_output / "train_arguments.yaml") as yaml_data:
        pretrain_arguments = yaml.load(yaml_data, Loader=yaml.FullLoader)

    pretrained_model = algorithms[pretrain_arguments["alg"]].load(
        args.pretrained_output / "".join(pretrain_arguments["model_name"].split(".")[:-1]),
        device='cpu')

    # Prepare tensorboard logging
    log_name = '{}_{}_{}'.format(args.experiment_name, args.task_name,
                                 datetime.now().strftime('%d-%m_%H-%M-%S'))
    run_dir = args.tensorboard_log + "/" + log_name
    Path(run_dir).mkdir(parents=True, exist_ok=True)
    callbacks = []
    # callbacks.append(CheckpointCallback(
    #     save_freq=1000000, save_path=run_dir, name_prefix='rl_model'))
    callbacks.append(LoggingCallback(logpath=run_dir))

    train_args = copy.copy(args)
    train_args.config = train_args.config.name
    pyaml.dump(train_args.__dict__, open(os.path.join(run_dir, 'train_arguments.yaml'), 'w'))

    assert args.task_name == pretrain_arguments["task_name"], "Envs must match for transfer learning"

    # Create the vectorized environment
    n_envs = train_args.n_envs  # Number of processes to use
    env = make_vec_env(args.task_name, n_envs=n_envs)

    # define network architecture
    if "GnnPolicy" in args.policy and args.net_arch is not None:
        for net_arch_part in args.net_arch.keys():
            for i, (layer_class_name, layer_size) in enumerate(args.net_arch[net_arch_part]):
                if hasattr(nn, layer_class_name):
                    args.net_arch[net_arch_part][i] = (getattr(nn, layer_class_name), layer_size)
                elif hasattr(nerve_net_conv, layer_class_name):
                    args.net_arch[net_arch_part][i] = (getattr(nerve_net_conv, layer_class_name), layer_size)
                else:
                    def get_class(x):
                        return globals()[x]

                    # resolve custom layer classes by name
                    c = get_class(layer_class_name)
                    assert c is not None, f"Unknown layer class '{layer_class_name}'"
                    args.net_arch[net_arch_part][i] = (c, layer_size)

    with open(os.path.join(run_dir, 'net_arch.txt'), 'w') as fp:
        fp.write(str(args.net_arch))

    # Create the model
    alg_class = algorithms[args.alg]

    policy_kwargs = dict()
    if args.net_arch is not None:
        policy_kwargs['net_arch'] = args.net_arch
    if args.activation_fn is not None:
        policy_kwargs["activation_fn"] = activation_functions[args.activation_fn]
    # policy_kwargs['device'] = args.device if args.device is not None else get_device('auto')
    if "GnnPolicy" in args.policy:
        policy_kwargs["mlp_extractor_kwargs"] = {
            "task_name": args.task_name,
            'device': args.device,
            'gnn_for_values': args.gnn_for_values,
            'controller_option': controller_option[args.controller_option],
            'embedding_option': embedding_option[args.embedding_option],
            'root_option': root_option[args.root_option],
            'drop_body_nodes': args.drop_body_nodes,
            'use_sibling_relations': args.use_sibling_relations,
            'xml_assets_path': args.xml_assets_path,
            'policy_readout_mode': args.policy_readout_mode
        }

    alg_kwargs = args.__dict__.copy()
    alg_kwargs.pop("config", None)
    alg_kwargs.pop("task_name", None)
    alg_kwargs.pop("policy", None)
    alg_kwargs.pop("activation_fn", None)
    alg_kwargs.pop("gnn_for_values", None)
    alg_kwargs.pop("embedding_option", None)
    alg_kwargs.pop("controller_option", None)
    alg_kwargs.pop("root_option", None)
    alg_kwargs.pop("xml_assets_path", None)
    alg_kwargs.pop("alg", None)
    alg_kwargs.pop("net_arch", None)
    alg_kwargs.pop("experiment_name", None)
    alg_kwargs.pop("job_dir", None)
    alg_kwargs.pop("total_timesteps", None)
alg_kwargs.pop("model_name", None) alg_kwargs.pop("n_envs", None) alg_kwargs.pop("drop_body_nodes", None) alg_kwargs.pop("use_sibling_relations", None) alg_kwargs.pop("experiment_name_suffix", None) alg_kwargs.pop("policy_readout_mode", None) alg_kwargs.pop("pretrained_output", None) model = alg_class( args.policy, env, verbose=1, # n_steps=args.n_steps, policy_kwargs=policy_kwargs, # device=args.device, # tensorboard_log=args.tensorboard_log, # learning_rate=args.learning_rate, # batch_size=args.batch_size, # n_epochs=args.n_epochs, **alg_kwargs) # model.learn(total_timesteps=args.total_timesteps, # callback=callbacks, # tb_log_name=log_name) # PPO Learn parameters: total_timesteps = args.total_timesteps callback = callbacks log_interval = 1 eval_env = make_vec_env(args.task_name, n_envs=1) eval_freq = 1e4 n_eval_episodes = 3 tb_log_name = log_name eval_log_path = None reset_num_timesteps = True ################################# ##### Custom Transfer Learn ##### ################################# iteration = 0 total_timesteps, callback = model._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name) ### setup pretrained model ### pretrained_model.num_timesteps = 0 pretrained_model._episode_num = 0 pretrained_model._total_timesteps = total_timesteps pretrained_model.ep_info_buffer = deque(maxlen=100) pretrained_model.ep_success_buffer = deque(maxlen=100) pretrained_model._last_obs = model.env.reset() pretrained_model._last_dones = np.zeros((model.env.num_envs, ), dtype=bool) callback.on_training_start(locals(), globals()) while pretrained_model.num_timesteps < total_timesteps: continue_training = pretrained_model.collect_rollouts( model.env, callback, model.rollout_buffer, n_rollout_steps=model.n_steps) if continue_training is False: break iteration += 1 model._update_current_progress_remaining( pretrained_model.num_timesteps, total_timesteps) # Display training infos if log_interval is not None and iteration % log_interval == 0: fps = int(pretrained_model.num_timesteps / (time.time() - model.start_time)) logger.record("time/iterations", iteration, exclude="tensorboard") if len(model.ep_info_buffer) > 0 and len( model.ep_info_buffer[0]) > 0: logger.record( "rollout/ep_rew_mean", safe_mean( [ep_info["r"] for ep_info in model.ep_info_buffer])) logger.record( "rollout/ep_len_mean", safe_mean( [ep_info["l"] for ep_info in model.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - model.start_time), exclude="tensorboard") logger.record("time/total_timesteps", pretrained_model.num_timesteps, exclude="tensorboard") logger.dump(step=pretrained_model.num_timesteps) model.train() callback.on_training_end() model.save( os.path.join(args.tensorboard_log + "/" + log_name, args.model_name))
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    print('setup training')
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())
    print(f'start training, total timesteps is {total_timesteps}')

    while self.num_timesteps < total_timesteps:
        print(f'num timesteps: {self.num_timesteps}/{total_timesteps}')
        print(f'collect rollouts, rollout steps = {self.n_steps}')
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
        if continue_training is False:
            print('stop training (only happens if callback on_step returns false)')
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        print('display training infos')
        # print(f'len(self.ep_info_buffer)={len(self.ep_info_buffer)}, len(self.ep_info_buffer[0])={len(self.ep_info_buffer[0])}')
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        print('train')
        self.train()

    callback.on_training_end()
    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "PPO",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        """Replay buffer size"""
        ### No need to use a larger buffer, because that doesn't solve the catastrophic forgetting problem.
        ### For this experiment, just tracking the best score is enough.
        # Determine buffer size using safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]).
        # It should stay stable once walking has been learned.
        # Start with a small buffer, then switch once the mean episode length reaches 1000:
        # ep_len_mean = safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])
        # if ep_len_mean >= 1000:
        #     self.use_small_buffer = False
        if not args.single and self.use_small_buffer:
            output(f"Collect rollouts for {self.n_steps // self.env.num_envs} steps.", 2)
            continue_training = self.collect_rollouts(
                self.env, callback, self.rollout_buffer_small, n_rollout_steps=self.n_steps // self.env.num_envs)
        else:
            output(f"Collect rollouts for {self.n_steps} steps.", 2)
            continue_training = self.collect_rollouts(
                self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()
    return self
def train(self) -> None:
    """
    Update policy using the currently gathered rollout buffer.
    """
    # Update optimizer learning rate
    self._update_learning_rate(self.policy.optimizer)
    # Compute current clip range
    clip_range = self.clip_range(self._current_progress_remaining)
    # Optional: clip range for the value function
    if self.clip_range_vf is not None:
        clip_range_vf = self.clip_range_vf(self._current_progress_remaining)

    entropy_losses, all_kl_divs = [], []
    pg_losses, value_losses = [], []
    clip_fractions = []

    # train for gradient_steps epochs
    for epoch in range(self.n_epochs):
        approx_kl_divs = []
        # Do a complete pass on the rollout buffer
        for rollout_data in self.rollout_buffer.get(self.batch_size):
            actions = rollout_data.actions
            if isinstance(self.action_space, spaces.Discrete):
                # Convert discrete action from float to long
                actions = rollout_data.actions.long().flatten()

            # Re-sample the noise matrix because the log_std has changed
            # TODO: investigate why there is no issue with the gradient
            # if that line is commented (as in SAC)
            if self.use_sde:
                self.policy.reset_noise(self.batch_size)

            values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
            values = values.flatten()
            # Normalize advantage
            advantages = rollout_data.advantages
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # ratio between old and new policy, should be one at the first iteration
            ratio = th.exp(log_prob - rollout_data.old_log_prob)

            # clipped surrogate loss
            policy_loss_1 = advantages * ratio
            policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
            policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()

            # Logging
            pg_losses.append(policy_loss.item())
            clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
            clip_fractions.append(clip_fraction)

            if self.clip_range_vf is None:
                # No clipping
                values_pred = values
            else:
                # Clip the difference between old and new value
                # NOTE: this depends on the reward scaling
                values_pred = rollout_data.old_values + th.clamp(
                    values - rollout_data.old_values, -clip_range_vf, clip_range_vf
                )
            # Value loss using the TD(gae_lambda) target
            value_loss = F.mse_loss(rollout_data.returns, values_pred)
            value_losses.append(value_loss.item())

            # Entropy loss favors exploration
            if entropy is None:
                # Approximate entropy when no analytical form
                entropy_loss = -th.mean(-log_prob)
            else:
                entropy_loss = -th.mean(entropy)
            entropy_losses.append(entropy_loss.item())

            loss = policy_loss + self.vf_coef * value_loss

            # Optimization step
            # # Critic
            # self.policy.critic_optimizer.zero_grad()
            # value_loss.backward()
            # # Clip grad norm
            # th.nn.utils.clip_grad_norm_(self.policy.value_net.parameters(), self.max_grad_norm)
            # self.policy.critic_optimizer.step()
            # # Actor
            # self.policy.optimizer.zero_grad()
            # policy_loss.backward()
            # # Clip grad norm
            # th.nn.utils.clip_grad_norm_(self.policy.action_net.parameters(), self.max_grad_norm)
            # self.policy.optimizer.step()

            # Actor and Critic
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip grad norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
            self.policy.optimizer.step()

            approx_kl_divs.append(th.mean(rollout_data.old_log_prob - log_prob).detach().cpu().numpy())

        all_kl_divs.append(np.mean(approx_kl_divs))

        if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl:
            print(f"Early stopping at step {epoch} due to reaching max kl: {np.mean(approx_kl_divs):.2f}")
            break

    self._n_updates += self.n_epochs
    explained_var = explained_variance(self.rollout_buffer.returns.flatten(), self.rollout_buffer.values.flatten())

    # Logs
    logger.record("train/entropy_loss", np.mean(entropy_losses))
    logger.record("train/policy_gradient_loss", np.mean(pg_losses))
    logger.record("train/value_loss", np.mean(value_losses))
    logger.record("train/approx_kl", np.mean(approx_kl_divs))
    logger.record("train/clip_fraction", np.mean(clip_fractions))
    logger.record("train/loss", loss.item())
    logger.record("train/explained_variance", explained_var)
    if hasattr(self.policy, "log_std"):
        logger.record("train/std", th.exp(self.policy.log_std).mean().item())
    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/clip_range", clip_range)
    if self.clip_range_vf is not None:
        logger.record("train/clip_range_vf", clip_range_vf)

    if self.wandb_use:
        if self._n_updates % 10 == 0:
            t_start = time.time()
            wandb_dict = dict()
            # wandb_dict["Mean Reward"] = np.mean(true_reward)
            wandb_dict["serial_timesteps"] = self._n_updates * self.n_steps
            wandb_dict["n_updates"] = self._n_updates
            wandb_dict["total_timesteps"] = self.num_timesteps
            wandb_dict["fps"] = self.fps
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                wandb_dict["ep_reward_mean"] = safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer])
                wandb_dict["ep_len_mean"] = safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer])
            if len(self.specific_reward_info_buffer) > 0:
                wandb_dict["mimic_qpos_reward"] = safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                wandb_dict["mimic_qvel_reward"] = safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                wandb_dict["mimic_body_orientation_reward"] = safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                wandb_dict["mimic_body_reward"] = safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                wandb_dict["mimic_body_vel_reward"] = safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                wandb_dict["mimic_contact_reward"] = safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer])
            wandb_dict["time_elapsed"] = t_start - self.t_first_start
            wandb_dict["train/entropy_loss"] = np.mean(entropy_losses)
            wandb_dict["train/policy_gradient_loss"] = np.mean(pg_losses)
            wandb_dict["train/value_loss"] = np.mean(value_losses)
            wandb_dict["train/approx_kl"] = np.mean(approx_kl_divs)
            wandb_dict["train/clip_fraction"] = np.mean(clip_fractions)
            wandb_dict["train/loss"] = loss.item()
            wandb_dict["train/explained_variance"] = explained_var
            wandb.log(wandb_dict)
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    from stable_baselines3.common.utils import obs_as_tensor, safe_mean
    import time

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int((self.num_timesteps - self._num_timesteps_at_start) / (time.time() - self.start_time))
            self.logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            self.logger.record("time/fps", fps)
            self.logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            # [RLA] set timesteps
            time_step_holder.set_time(self.num_timesteps)
            self.logger.dump()

        self.train()

    callback.on_training_end()
    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        for partner_idx in range(self.policy.num_partners):
            try:
                self.env.envs[0].switch_to_env(partner_idx)
            except:
                pass
            continue_training = self.collect_rollouts(
                self.env, callback, self.rollout_buffer[partner_idx],
                n_rollout_steps=self.n_steps, partner_idx=partner_idx)
            # continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer[partner_idx], n_rollout_steps=self.n_steps)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()
    return self
def main():
    # check for uncommitted changes
    commit_check()

    ## setup args
    parser = argparse.ArgumentParser(description='Reward learning from preferences')
    parser.add_argument('--env_type', type=str, default='atari')
    parser.add_argument('--env_name', type=str, default='BeamRider')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--log_dir', type=str, default='LOGS')
    parser.add_argument('--log_prefix', type=str, default='')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cpu_buffer', dest='on_cuda', action='store_false',
                        help='whether to store the buffer on CPU or GPU; '
                             'by default requires up to 8GB memory on GPU')
    parser.add_argument('--resume_training', action='store_true')
    parser.add_argument('--init_buffer_size', type=int, default=500)
    parser.add_argument('--init_train_size', type=int, default=10**5,
                        help='number of labels to process during initial training of the reward model')
    parser.add_argument('--clip_size', type=int, default=25,
                        help='number of frames in each clip generated for comparison')
    parser.add_argument('--total_timesteps', type=int, default=5 * 10**7,
                        help='total number of RL timesteps to be taken')
    parser.add_argument('--n_labels', type=int, default=6800,
                        help='total number of labels to collect throughout the training')
    parser.add_argument('--steps_per_iter', type=int, default=5 * 10**4,
                        help='number of RL steps taken on each iteration')
    parser.add_argument('--pairs_per_iter', type=int, default=5 * 10**3,
                        help='number of labels the reward model is trained on each iteration')
    parser.add_argument('--pairs_in_batch', type=int, default=16,
                        help='batch size for reward model training')
    parser.add_argument('--l2', type=float, default=0.0001,
                        help='initial l2 regularization for a reward model')
    parser.add_argument('--adaptive', dest='adaptive', action='store_true')
    parser.add_argument('--no-adaptive', dest='adaptive', action='store_false')
    parser.set_defaults(adaptive=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    args = parser.parse_args()

    args.ppo_kwargs = dict(verbose=1, n_steps=256, noptepochs=3, nminibatches=8)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f'\n Using {device} for training')

    run_dir, monitor_dir, video_dir = setup_logging(args)
    global LOG_TIME
    LOG_TIME = os.path.join(run_dir, "TIME_LOG.txt")

    ### Initializing objects ###
    # If resuming some earlier training run - load stored objects
    if args.resume_training:
        args = load_args(args)
        reward_model, policy, data_buffer, i_num = load_state(run_dir)

    atari_name = args.env_name + "NoFrameskip-v4"
    venv_fn = lambda: make_atari_continuous(atari_name, n_envs=16)
    annotation_env = make_atari_continuous(atari_name, n_envs=16)
    annotation_env.reset()
    iter_time = 0

    # In case this is a fresh experiment - initialize fresh objects
    if not args.resume_training:
        store_args(args, run_dir)
        policy = A2C('CnnPolicy', venv_fn(), verbose=1,
                     tensorboard_log="TB_LOGS",
                     ent_coef=0.01,
                     learning_rate=0.0007,
                     policy_kwargs={
                         "optimizer_class": torch.optim.Adam,
                         "optimizer_kwargs": {
                             "eps": 1e-5,
                             "betas": [.99, .999]
                         }
                     })
        reward_model = RewardNet(l2=args.l2, dropout=args.dropout, env_type=args.env_type)
        data_buffer = AnnotationBuffer()

    # initializing RM optimizer
    rm_optimizer = optim.Adam(reward_model.parameters(), lr=0.0003, weight_decay=reward_model.l2)
    # creating the environment with reward replaced by the prediction from reward_model
    reward_model.to(device)
    proxy_reward_function = lambda x: reward_model(torch.from_numpy(x).float().to(device))
    proxy_reward_venv = Vec_reward_wrapper(venv_fn(), proxy_reward_function)
    # resetting the environment to avoid raising error from reset_num_timesteps
    proxy_reward_venv.reset()
    policy.set_env(proxy_reward_venv)

    # eval_env_fn = lambda: make_atari_default(atari_name, n_envs=16, seed=0, vec_env_cls=SubprocVecEnv)
    # video_env_fn = lambda: make_atari_default(atari_name, vec_env_cls=DummyVecEnv)

    # in case this is a fresh run, collect init_buffer_size samples to AnnotationBuffer
    # and train the reward model on init_train_size number of samples with replacement
    if not args.resume_training:
        t_start = time.time()
        print(f'================== Initial iter ====================')
        annotations = collect_annotations(annotation_env, policy, args.init_buffer_size, args.clip_size, args.on_cuda)
        data_buffer.add(annotations)
        print(f'Buffer size = {data_buffer.current_size}')

        reward_model, rm_optimizer, rm_train_stats = train_reward(
            reward_model, rm_optimizer, args.adaptive, data_buffer, args.init_train_size, args.pairs_in_batch)

        # this callback adds values to TensorBoard logs for easier plotting
        reward_model.eval()
        callback = TensorboardCallback((data_buffer.total_labels, data_buffer.loss_lb, iter_time, rm_train_stats))
        policy = train_policy(policy, args.steps_per_iter, 0, args.log_name, callback)
        save_state(run_dir, 0, reward_model, policy, data_buffer)

        true_performance = safe_mean([ep_info["r"] for ep_info in policy.ep_info_buffer])

        t_finish = time.time()
        iter_time = t_finish - t_start
        log_iter(run_dir, args.steps_per_iter, data_buffer, true_performance, 0, rm_train_stats, iter_time)
        print(f'Iteration took {time.gmtime(t_finish - t_start).tm_min} min {time.gmtime(t_finish - t_start).tm_sec} sec')

        # i_num is the number of training iterations taken
        i_num = 1

    num_iters = int(args.total_timesteps / args.steps_per_iter)
    # calculating the initial number of pairs to collect
    num_pairs = init_num_pairs = round((args.n_labels - args.init_buffer_size) / 0.239 / num_iters)
    print('init_num_pairs = {}'.format(init_num_pairs))

    for i in range(i_num, num_iters):
        t_start = time.time()
        print(f'================== iter : {i} ====================')
        rl_steps = i * args.steps_per_iter
        # decaying the number of pairs to collect
        num_pairs = round(init_num_pairs / (rl_steps / (args.total_timesteps / 10) + 1))

        annotations = collect_annotations(annotation_env, policy, num_pairs, args.clip_size, args.on_cuda)
        data_buffer.add(annotations)
        print(f'Buffer size = {data_buffer.current_size}')

        reward_model, rm_optimizer, rm_train_stats = train_reward(
            reward_model, rm_optimizer, args.adaptive, data_buffer, args.pairs_per_iter, args.pairs_in_batch)

        # TODO: prettify passing data to callback
        callback = TensorboardCallback((data_buffer.total_labels, data_buffer.loss_lb, iter_time, rm_train_stats))
        policy = train_policy(policy, args.steps_per_iter, rl_steps, args.log_name, callback)

        # storing the state every 1M steps
        # this assumes that steps_per_iter divides 10**6
        if rl_steps % (10**6) == 0:
            save_state(run_dir, i, reward_model, policy, data_buffer)
            # record_video(policy, video_env_fn(), video_dir, 4000, f"{i}_ITER00_{args.env_name}")

        # true_performance = eval_policy(venv_fn(), policy, n_eval_episodes=50)
        # proxy_performance = eval_policy(test_env, policy, n_eval_episodes=50)
        true_performance = safe_mean([ep_info["r"] for ep_info in policy.ep_info_buffer])
        # print(f'True policy performance = {true_performance}')
        # print(f'Proxy policy performance = {proxy_performance}')

        t_finish = time.time()
        iter_time = t_finish - t_start
        log_iter(run_dir, rl_steps, data_buffer, true_performance, 0, rm_train_stats, iter_time)

        if LOG_TIME:
            with open(LOG_TIME, 'a') as f:
                f.write(f'Iteration took {time.gmtime(iter_time).tm_min} min {time.gmtime(iter_time).tm_sec} sec\n')
                f.write(f'================== iter : {i+1} ====================\n')
        else:
            print(f'Iteration took {time.gmtime(iter_time).tm_min} min {time.gmtime(iter_time).tm_sec} sec')
def main():
    def env_contr():
        return gym.make("CartPole-v0")
        # # env = multiwalker_v0.env()
        # env = pad_observations(env)
        # env = pad_action_space(env)
        # markov_env = aec_to_markov(env)
        # venv = MarkovVectorEnv(markov_env)
        # return venv

    n_envs = 6
    # def nest_env_const():
    #     cat = ConcatVecEnv([env_contr] * envs_per_proc)
    #     return cat
    example_env = env_contr()
    num_envs = n_envs * 1  # example_env.num_envs
    # cat = ProcConcatVec([nest_env_const] * n_procs, example_env.observation_space, example_env.action_space, num_envs)
    cat = MakeCPUAsyncConstructor(0)([env_contr] * n_envs, example_env.observation_space, example_env.action_space)  # , num_envs)
    cat = VecEnvWrapper(cat)
    env = cat
    policy = "MlpPolicy"

    logger = make_logger("log")
    stable_baselines3.common.logger.Logger.CURRENT = logger

    a2c = PPO(policy, cat, n_steps=4, batch_size=6, n_epochs=3)
    print(type(a2c.env))
    # a2c.learn(1000000)

    total_timesteps, callback = a2c._setup_learn(10000, None, None, None,
                                                 n_eval_episodes=5,
                                                 reset_num_timesteps=None,
                                                 tb_log_name="PPo")
    # total_timesteps = 100
    iteration = 0
    log_interval = 1
    for i in range(total_timesteps):
        continue_training = a2c.collect_rollouts(env, callback, a2c.rollout_buffer, n_rollout_steps=a2c.n_steps)
        print(a2c.ep_info_buffer)
        if continue_training is False:
            break
        iteration += 1
        a2c._update_current_progress_remaining(a2c.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(a2c.num_timesteps / (time.time() - a2c.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            print(a2c.ep_info_buffer)
            if len(a2c.ep_info_buffer) > 0 and len(a2c.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in a2c.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in a2c.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - a2c.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", a2c.num_timesteps, exclude="tensorboard")
            logger.dump(step=a2c.num_timesteps)

        a2c.train()
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    param_noise: bool = False,
    sigma: float = 0.1,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        # during rollout we collect batches of states and rewards
        continue_training = self.collect_rollouts(
            self.env, callback, self.rollout_buffer,
            n_rollout_steps=self.n_steps, param_noise=param_noise, sigma=sigma)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # during training gradient descent is done
        self.train(param_noise, sigma)
        if param_noise:
            sigma = self.update_sigma(sigma)
            # print("current_sigma")
            # print(sigma)

    callback.on_training_end()
    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    parameter_noise: bool = False,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    # initializing value of noise std
    current_sigma = 1.0

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(
            self.env, callback, self.rollout_buffer,
            n_rollout_steps=self.n_steps, parameter_noise=parameter_noise, sigma=0.5)
        if continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

        if parameter_noise:
            states = self.rollout_buffer.observations
            states = th.tensor(states)
            actions_unnoisy, values_unnoisy, log_prob_unnoisy = self.policy(states, parameter_noise=False)
            actions_noisy, values_noisy, log_prob_noisy = self.policy(states, parameter_noise=True, sigma=current_sigma)
            distance = th.sum((actions_unnoisy - actions_noisy) ** 2) ** 0.5
            distance_threshold = 1
            sigma_scalefactor = 1.01
            if distance > distance_threshold:
                current_sigma /= sigma_scalefactor
            else:
                current_sigma *= sigma_scalefactor

    callback.on_training_end()
    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name)
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            n_episodes=-1,
            n_steps=1,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )
        if rollout.continue_training is False:
            break
        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            self.train(gradient_steps=1, batch_size=self.batch_size)

    callback.on_training_end()
    return self
def collect_rollouts(self,  # noqa: C901
                     env: VecEnv,
                     # Type hint as string to avoid circular import
                     callback: 'BaseCallback',
                     n_episodes: int = 1,
                     n_steps: int = -1,
                     action_noise: Optional[ActionNoise] = None,
                     learning_starts: int = 0,
                     replay_buffer: Optional[ReplayBuffer] = None,
                     log_interval: Optional[int] = None) -> RolloutReturn:
    """
    Collect experiences and store them into a ReplayBuffer.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param n_episodes: (int) Number of episodes to use to collect rollout data.
        You can also specify a ``n_steps`` instead.
    :param n_steps: (int) Number of steps to use to collect rollout data.
        You can also specify a ``n_episodes`` instead.
    :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: (int) Number of steps before learning for the warm-up phase.
    :param replay_buffer: (ReplayBuffer)
    :param log_interval: (int) Log data every ``log_interval`` episodes
    :return: (RolloutReturn)
    """
    episode_rewards, total_timesteps = [], []
    total_steps, total_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"

    if n_episodes > 0 and n_steps > 0:
        # Note we are referring to the constructor arguments
        # that are named `train_freq` and `n_episodes_rollout`
        # but correspond to `n_steps` and `n_episodes` here
        warnings.warn(
            "You passed a positive value for `train_freq` and `n_episodes_rollout`."
            "Please make sure this is intended. "
            "The agent will collect data by stepping in the environment "
            "until both conditions are true: "
            "`number of steps in the env` >= `train_freq` and "
            "`number of episodes` > `n_episodes_rollout`")

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while total_steps < n_steps or total_episodes < n_episodes:
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            if self.num_timesteps < learning_starts and not (self.use_sde and self.use_sde_at_warmup):
                # Warmup phase
                unscaled_action = np.array([self.action_space.sample()])
            else:
                # Note: we assume that the policy uses tanh to scale the action
                # We use non-deterministic action in the case of SAC, for TD3, it does not matter
                unscaled_action, _ = self.predict(self._last_obs, deterministic=False)

            # Rescale the action from [low, high] to [-1, 1]
            if isinstance(self.action_space, gym.spaces.Box):
                scaled_action = self.policy.scale_action(unscaled_action)

                # Add noise to the action (improve exploration)
                if action_noise is not None:
                    # NOTE: in the original implementation of TD3, the noise was applied to the unscaled action
                    # Update(October 2019): Not anymore
                    scaled_action = np.clip(scaled_action + action_noise(), -1, 1)

                # We store the scaled action in the buffer
                buffer_action = scaled_action
                action = self.policy.unscale_action(scaled_action)
            else:
                # Discrete case, no need to normalize or clip
                buffer_action = unscaled_action
                action = buffer_action

            # Rescale and perform action
            new_obs, reward, done, infos = env.step(action)

            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)

            # Store data in replay buffer
            if replay_buffer is not None:
                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs()
                    reward_ = self._vec_normalize_env.get_original_reward()
                else:
                    # Avoid changing the original ones
                    self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done)

            self._last_obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_

            self.num_timesteps += 1
            episode_timesteps += 1
            total_steps += 1
            if 0 < n_steps <= total_steps:
                break

        if done:
            total_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/episodes", self._episode_num, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record('rollout/ep_rew_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.record('rollout/ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record('time/time_elapsed', int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
                if self.use_sde:
                    logger.record("train/std", (self.actor.get_std()).mean().item())
                if len(self.ep_success_buffer) > 0:
                    logger.record('rollout/success rate', safe_mean(self.ep_success_buffer))
                # Pass the number of timesteps for tensorboard
                logger.dump(step=self.num_timesteps)

    mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
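# The `RolloutReturn` constructed above is a small result container; its exact definition
# lives in stable_baselines3.common.type_aliases. A hedged sketch of an equivalent named
# tuple, with field names inferred from the positional/keyword usage in the snippet above:
from typing import NamedTuple

class RolloutReturn(NamedTuple):
    episode_reward: float     # mean reward over the episodes completed during collection
    episode_timesteps: int    # total environment steps collected
    n_episodes: int           # number of episodes completed
    continue_training: bool   # False when a callback asked to stop training early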