def _sanity_checks(self):
    """Run sanity-check assertions on the current configuration."""
    assert self.steps_per_epoch % mpi_tools.num_procs() == 0, \
        f'steps_per_epoch ({self.steps_per_epoch}) must be divisible by ' \
        f'the number of processes ({mpi_tools.num_procs()}).'
    assert self.max_ep_len <= self.local_steps_per_epoch, \
        f'Reduce number of cores ({mpi_tools.num_procs()}) or increase ' \
        f'batch size {self.steps_per_epoch}.'
    assert self.train_pi_iterations > 0
    assert self.train_v_iterations > 0
    assert isinstance(self.env, gym.Env), 'Env is not the expected type.'
def check_distributed_parameters(self) -> None:
    """Check if parameters are synchronized across all processes."""
    if mpi_tools.num_procs() > 1:
        self.logger.log('Check if distributed parameters are synchronous..')
        modules = {'Policy': self.ac.pi.net, 'Value': self.ac.v.net}
        for key, module in modules.items():
            flat_params = U.get_flat_params_from(module).numpy()
            global_min = mpi_tools.mpi_min(np.sum(flat_params))
            global_max = mpi_tools.mpi_max(np.sum(flat_params))
            assert np.allclose(global_min, global_max), f'{key} not synced.'
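# Illustrative sketch (not part of this repository): the synchronization check
# above amounts to reducing a scalar checksum of the parameters with MPI min
# and max and comparing the two results. Written directly against mpi4py,
# which is assumed to be available; the actual mpi_tools wrappers may differ.
def _sync_check_sketch(module):
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    checksum = sum(float(p.sum()) for p in module.parameters())
    lo = comm.allreduce(checksum, op=MPI.MIN)
    hi = comm.allreduce(checksum, op=MPI.MAX)
    # Identical weights on every rank <=> global min equals global max.
    return abs(hi - lo) < 1e-8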
def _init_mpi(self) -> None:
    """Initialize MPI specifics."""
    if mpi_tools.num_procs() > 1:
        # Avoid slowdowns from PyTorch + MPI combo.
        mpi_tools.setup_torch_for_mpi()
        dt = time.time()
        self.logger.log('INFO: Sync actor critic parameters')
        # Sync params across cores: only once necessary, grads are averaged!
        mpi_tools.sync_params(self.ac)
        self.logger.log(f'Done! (took {time.time()-dt:0.3f} sec.)')
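# Parameter syncing of the kind performed by mpi_tools.sync_params typically
# broadcasts rank 0's weights to every other rank. A minimal sketch with
# mpi4py, assuming CPU tensors (illustrative only; the real helper may differ):
def _broadcast_params_sketch(module):
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    for p in module.parameters():
        buf = p.data.numpy()      # NumPy view sharing memory with the tensor
        comm.Bcast(buf, root=0)   # overwrite every rank with rank 0's values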
def update(self, x) -> None:
    """Update internals incrementally.

    Note: works for both vector and matrix inputs.

    MPI implementation according to Chan et al. [10]; see:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    """
    x = self._convert_to_torch(x)

    # ==== Input checks
    msg = f'Expected dim in [1, 2], but got dim={len(x.shape)}.'
    assert len(x.shape) == 2 or len(x.shape) == 1, msg
    if self.shape[0] > 1:  # expect matrix inputs
        msg = f'Expected obs_dim={self.shape[0]} but got: {x.shape[1]}'
        assert len(x.shape) == 2 and x.shape[1] == self.shape[0], msg
    if self.shape[0] == 1:
        assert len(x.shape) == 1, f'Expected dim=1 but got: {x.shape}'
        # reshape is necessary since mean operator reduces vector dim by one
        x = x.view((-1, 1))

    n_B = x.shape[0] * mpi_tools.num_procs()  # get batch size
    n_A = self.count.clone()
    n_AB = self.count + n_B
    batch_mean = torch.mean(x, dim=0)

    # 1) Calculate mean and average batch mean across processes
    mpi_tools.mpi_avg_torch_tensor(batch_mean)
    delta = batch_mean - self.mean
    mean_new = self.mean + delta * n_B / n_AB

    # 2) Determine variance and sync across processes
    diff = x - mean_new
    batch_var = torch.mean(diff**2, dim=0)
    mpi_tools.mpi_avg_torch_tensor(batch_var)

    # Update running terms
    M2_A = n_A * self.var
    M2_B = n_B * batch_var
    ratio = n_A * n_B / n_AB
    M2_AB = M2_A + M2_B + delta**2 * ratio

    # 3) Update parameters - access internal values with data attribute
    self.mean.data = mean_new
    self.count.data = n_AB
    new_var = M2_AB / n_AB
    self.std.data = torch.sqrt(new_var)
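# Single-process sanity check of the Chan et al. merge formulas that the
# update above builds on (illustrative snippet, not part of this module):
# merging the statistics of two chunks reproduces the statistics of their
# concatenation. Note that the method above additionally averages the batch
# statistics over MPI workers before merging.
def _chan_merge_sanity_check():
    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.normal(size=(64, 3))
    b = rng.normal(loc=2.0, size=(128, 3))

    n_A, n_B = len(a), len(b)
    n_AB = n_A + n_B
    mean_A, mean_B = a.mean(axis=0), b.mean(axis=0)
    delta = mean_B - mean_A

    mean_AB = mean_A + delta * n_B / n_AB
    M2_A = n_A * a.var(axis=0)    # population variance times sample count
    M2_B = n_B * b.var(axis=0)
    M2_AB = M2_A + M2_B + delta**2 * n_A * n_B / n_AB

    assert np.allclose(mean_AB, np.concatenate([a, b]).mean(axis=0))
    assert np.allclose(M2_AB / n_AB, np.concatenate([a, b]).var(axis=0))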
def check_alg(alg_name, env_id, cores):
    """Run one epoch update with algorithm."""
    defaults = U.get_defaults_kwargs(alg=alg_name, env_id=env_id)
    defaults['epochs'] = 1
    defaults['num_mini_batches'] = 4
    defaults['steps_per_epoch'] = 1000 * mpi_tools.num_procs()
    defaults['verbose'] = False
    print(defaults['steps_per_epoch'])
    defaults['logger_kwargs'] = setup_logger_kwargs(
        exp_name='unittest',
        seed=0,
        base_dir='/var/tmp/',
        datestamp=True,
        level=0,
        use_tensor_board=False,
        verbose=False)
    alg = U.get_alg_class(alg_name, env_id, **defaults)
    # sanity check of argument passing
    assert alg.alg == alg_name, f'Expected {alg_name} but got {alg.alg}'
    # return learn_fn(env_id, **defaults)
    ac, env = alg.learn()
    return ac, env
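# Hypothetical pytest-style usage of the helper above; the environment id
# below is a placeholder and needs to match an environment actually supported
# by this repository.
import pytest

@pytest.mark.parametrize('alg_name', ['iwpg'])
def test_one_epoch_update(alg_name):
    ac, env = check_alg(alg_name, env_id='CartPole-v1', cores=1)
    assert ac is not None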
def eval(self, env, ac, num_evaluations):
    """Evaluate actor critic module for given number of evaluations."""
    self.ac = ac
    self.ac.eval()  # disable exploration noise
    if isinstance(env, gym.Env):
        self.env = env
    elif isinstance(env, str):
        self.env = gym.make(env)
    else:
        raise TypeError('Env is not of type: str, gym.Env')
    size = mpi_tools.num_procs()
    num_local_evaluations = num_evaluations // size
    returns = np.zeros(num_local_evaluations, dtype=np.float32)
    costs = np.zeros(num_local_evaluations, dtype=np.float32)
    ep_lengths = np.zeros(num_local_evaluations, dtype=np.float32)

    for i in range(num_local_evaluations):
        returns[i], ep_lengths[i], costs[i] = self.eval_once()

    # Gather returns from all processes
    # Note: only root process owns valid data...
    returns = list(mpi_tools.gather_and_stack(returns))
    costs = list(mpi_tools.gather_and_stack(costs))

    # now write returns as column into output file...
    if mpi_tools.proc_id() == 0:
        self.write_to_file(self.ret_file, contents=returns)
        print('Saved to:', os.path.join(self.log_dir, self.ret_file_name))
        if self.log_costs:
            self.write_to_file(self.costs_file, contents=costs)
        print(f'Mean Ret: {np.mean(returns)} \t'
              f'Mean EpLen: {np.mean(ep_lengths)} \t'
              f'Mean Costs: {np.mean(costs)}')

    self.ac.train()  # back to train mode
    return np.array(returns), np.array(ep_lengths), np.array(costs)
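# For illustration only: one possible implementation of the gather-and-stack
# behaviour relied on above, written directly against mpi4py (the actual
# mpi_tools helper may differ). Only the root process receives the stacked
# data; the other ranks keep their local arrays.
def _gather_and_stack_sketch(local):
    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    chunks = comm.gather(local, root=0)   # list of per-rank arrays on rank 0, None elsewhere
    return np.concatenate(chunks) if comm.Get_rank() == 0 else local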
def __init__(
        self,
        actor: str,
        ac_kwargs: dict,
        env_id: str,
        epochs: int,
        logger_kwargs: dict,
        adv_estimation_method: str = 'gae',
        alg='iwpg',
        check_freq: int = 25,
        entropy_coef: float = 0.01,
        gamma: float = 0.99,
        lam: float = 0.95,  # GAE scalar
        lam_c: float = 0.95,  # GAE scalar for cost estimation
        max_ep_len: int = 1000,
        max_grad_norm: float = 0.5,
        num_mini_batches: int = 16,  # used for value network training
        optimizer: str = 'Adam',  # policy optimizer
        pi_lr: float = 3e-4,
        steps_per_epoch: int = 32 * 1000,  # number of global steps per epoch
        target_kl: float = 0.01,
        train_pi_iterations: int = 80,
        train_v_iterations: int = 5,
        trust_region='plain',  # used for easy filtering in plot utils
        use_cost_value_function: bool = False,
        use_entropy: bool = False,
        use_exploration_noise_anneal: bool = False,
        use_kl_early_stopping: bool = False,
        use_linear_lr_decay: bool = True,
        use_max_grad_norm: bool = False,
        use_reward_scaling: bool = True,
        use_reward_penalty: bool = False,
        use_shared_weights: bool = False,
        use_standardized_advantages: bool = False,
        use_standardized_obs: bool = True,
        verbose: bool = True,
        vf_lr: float = 1e-3,
        weight_initialization: str = 'kaiming_uniform',
        save_freq: int = 10,
        seed: int = 0,
        video_freq: int = -1,  # set to positive integer for video recording
        **kwargs  # use to log parameters from child classes
):
    """
    Parameters
    ----------
    actor
    ac_kwargs
    env_id
    epochs
    logger_kwargs
    adv_estimation_method
    alg
    check_freq
    entropy_coef
    gamma
    lam
    lam_c
    max_ep_len
    max_grad_norm
    num_mini_batches
    optimizer
    pi_lr
    steps_per_epoch
    target_kl
    train_pi_iterations
    train_v_iterations
    trust_region
    use_cost_value_function
    use_entropy
    use_exploration_noise_anneal
    use_kl_early_stopping
    use_linear_lr_decay
    use_max_grad_norm
    use_reward_scaling
    use_reward_penalty
    use_shared_weights
    use_standardized_advantages
    use_standardized_obs
    verbose
    vf_lr
    weight_initialization
    save_freq
    seed
    video_freq
    kwargs
    """
    # Environment calls
    # TODO: call gym.make with **kwargs (to allow customization)
    self.env = env = gym.make(env_id) if isinstance(env_id, str) else env_id

    # Collect information from the environment if it has a time-limit wrapper
    if hasattr(self.env, '_max_episode_steps'):
        max_ep_len = self.env._max_episode_steps

    self.adv_estimation_method = adv_estimation_method
    self.alg = alg
    self.check_freq = check_freq
    self.entropy_coef = entropy_coef if use_entropy else 0.0
    self.epoch = 0  # iterated in learn method
    self.epochs = epochs
    self.lam = lam
    self.local_steps_per_epoch = steps_per_epoch // mpi_tools.num_procs()
    self.logger_kwargs = logger_kwargs
    self.max_ep_len = max_ep_len
    self.max_grad_norm = max_grad_norm
    self.num_mini_batches = num_mini_batches
    self.pi_lr = pi_lr
    self.save_freq = save_freq
    self.seed = seed
    self.steps_per_epoch = steps_per_epoch
    self.target_kl = target_kl
    self.train_pi_iterations = train_pi_iterations
    self.train_v_iterations = train_v_iterations
    self.use_cost_value_function = use_cost_value_function
    self.use_exploration_noise_anneal = use_exploration_noise_anneal
    self.use_kl_early_stopping = use_kl_early_stopping
    self.use_linear_lr_decay = use_linear_lr_decay
    self.use_max_grad_norm = use_max_grad_norm
    self.use_reward_penalty = use_reward_penalty
    self.use_reward_scaling = use_reward_scaling
    self.use_standardized_obs = use_standardized_obs
    self.use_standardized_advantages = use_standardized_advantages
    self.video_freq = video_freq
    self.vf_lr = vf_lr

    # ==== Call assertions....
    self._sanity_checks()

    # === Set up logger and save configuration to disk
    # get local parameters before logger instance to avoid unnecessary print
    self.params = locals()
    self.logger = self._init_logger()
    self.logger.save_config(self.params)

    # === Seeding
    seed += 10000 * mpi_tools.proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)
    self.env.seed(seed=seed)

    # === Set up actor-critic module
    self.ac = core.ActorCriticWithCosts(
        actor_type=actor,
        observation_space=env.observation_space,
        action_space=env.action_space,
        use_standardized_obs=use_standardized_obs,
        use_scaled_rewards=use_reward_scaling,
        use_shared_weights=use_shared_weights,
        weight_initialization=weight_initialization,
        ac_kwargs=ac_kwargs)

    # === Set up MPI specifics
    self._init_mpi()

    # === Set up experience buffer
    self.buf = core.Buffer(
        actor_critic=self.ac,
        obs_dim=env.observation_space.shape,
        act_dim=env.action_space.shape,
        size=self.local_steps_per_epoch,
        gamma=gamma,
        lam=lam,
        adv_estimation_method=adv_estimation_method,
        use_scaled_rewards=use_reward_scaling,
        standardize_env_obs=use_standardized_obs,
        standardize_advantages=use_standardized_advantages,
        lam_c=lam_c,
        use_reward_penalty=use_reward_penalty,
    )

    # Set up optimizers for policy and value function
    self.pi_optimizer = core.get_optimizer(optimizer, module=self.ac.pi, lr=pi_lr)
    self.vf_optimizer = core.get_optimizer('Adam', module=self.ac.v, lr=vf_lr)
    if use_cost_value_function:
        self.cf_optimizer = core.get_optimizer('Adam', module=self.ac.c,
                                               lr=self.vf_lr)

    # Set up video recorder
    self.recorder = self._init_video_recorder()

    # Set up scheduler for policy learning rate decay
    self.scheduler = self._init_learning_rate_scheduler()

    # Set up model saving
    self.logger.setup_torch_saver(self.ac)
    self.logger.torch_save()

    # Set up statistics
    self.start_time = time.time()
    self.epoch_time = time.time()
    self.loss_pi_before = 0.0
    self.loss_v_before = 0.0
    self.loss_c_before = 0.0
    self.logger.log('Start with training.')
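# One common way to realise the linear learning-rate decay enabled by
# use_linear_lr_decay is a LambdaLR schedule over epochs. This is a sketch
# only; _init_learning_rate_scheduler in this repository may be implemented
# differently.
def _linear_lr_scheduler_sketch(optimizer, epochs):
    import torch

    # Scale the learning rate from 1.0 at epoch 0 down towards 0 at the final epoch.
    return torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda epoch: max(1.0 - epoch / epochs, 0.0))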