def make_train_plots(log=None, log_path=None, keys=None, save_loc=None):
    if log is None and log_path is None:
        print("Need to provide either the log or path to a log file")
    if log is None:
        logger = DataLog()
        logger.read_log(log_path)
        log = logger.log
    # make plots for specified keys
    for key in keys:
        if key in log.keys():
            plt.figure(figsize=(10, 6))
            plt.plot(log['steps'], log[key])
            plt.title(key)
            plt.savefig(save_loc + '/' + key + '.png', dpi=100)
            plt.close()
def __init__(self, env, policy, baseline,
             learn_rate=0.01,
             seed=123,
             desired_kl=None,
             save_logs=False,
             **kwargs):
    self.env = env
    self.policy = policy
    self.baseline = baseline
    self.alpha = learn_rate
    self.seed = seed
    self.save_logs = save_logs
    self.running_score = None
    self.desired_kl = desired_kl
    self.dump_paths = False
    self.augmentation = None
    self.direct_learning_augment_samples_count = 0
    if save_logs:
        self.logger = DataLog()
def make_train_plots(log=None, log_path=None, keys=None, save_loc=None):
    if log is None and log_path is None:
        print("Need to provide either the log or path to a log file")
    if log is None:
        logger = DataLog()
        logger.read_log(log_path)
        log = logger.log
    # make plots for specified keys
    for key in keys:
        if key in log.keys():
            plt.figure(figsize=(10, 6))
            if 'samples' in keys:
                cum_samples = [
                    np.sum(log['num_samples'][:i])
                    for i in range(len(log['num_samples']))
                ]
                plt.plot(cum_samples, log[key])
                plt.xlabel('samples')
            else:
                plt.plot(log[key])
                plt.xlabel('iterations')
            plt.title(key)
            plt.savefig(save_loc + '/' + key + '.png', dpi=100)
            plt.close()
def make_train_plots(log=None, log_path=None, keys=None, save_loc=None,
                     sample_key='num_samples', x_scale=1.0, y_scale=1.0):
    if log is None and log_path is None:
        print("Need to provide either the log or path to a log file")
    if log is None:
        logger = DataLog()
        logger.read_log(log_path)
        log = logger.log
    # make plots for specified keys
    for key in keys:
        if key in log.keys():
            fig = plt.figure(figsize=(10, 6))
            ax1 = fig.add_subplot(111)
            try:
                cum_samples = [
                    np.sum(log[sample_key][:i]) * x_scale
                    for i in range(len(log[sample_key]))
                ]
                ax1.plot(cum_samples, [elem * y_scale for elem in log[key]])
                ax1.set_xlabel('samples')
                # mark iteration on the top axis
                ax2 = ax1.twiny()
                ax2.set_xlabel('iterations', color=(.7, .7, .7))
                ax2.tick_params(axis='x', labelcolor=(.7, .7, .7))
                ax2.set_xlim([0, len(log[key])])
            except:
                ax1.plot(log[key])
                ax1.set_xlabel('iterations')
            ax1.set_title(key)
            plt.savefig(save_loc + '/' + key + '.png', dpi=100)
            plt.close()
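# Minimal usage sketch for the plotting helper above (hedged: the log path, key names,
# and output directory are placeholders; matplotlib/numpy are assumed imported as plt/np
# and the save_loc directory is assumed to exist):
make_train_plots(log_path='logs/log.csv',
                 keys=['stoc_pol_mean', 'running_score'],
                 save_loc='plots')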
class PPO(BatchREINFORCE):
    def __init__(self, env, policy, baseline,
                 clip_coef=0.2,
                 epochs=10,
                 mb_size=64,
                 learn_rate=3e-4,
                 seed=0,
                 save_logs=False):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.learn_rate = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.clip_coef = clip_coef
        self.epochs = epochs
        self.mb_size = mb_size
        self.running_score = None
        if save_logs:
            self.logger = DataLog()
        self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate)

    def PPO_surrogate(self, observations, actions, advantages):
        adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        LR_clip = torch.clamp(LR, min=1 - self.clip_coef, max=1 + self.clip_coef)
        ppo_surr = torch.mean(torch.min(LR * adv_var, LR_clip * adv_var))
        return ppo_surr

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        params_before_opt = self.policy.get_param_values()

        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in range(self.epochs):
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                adv = advantages[rand_idx]
                self.optimizer.zero_grad()
                loss = -self.PPO_surrogate(obs, act, adv)
                loss.backward()
                self.optimizer.step()

        params_after_opt = self.policy.get_param_values()
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        t_opt = timer.time() - ts

        # Log information
        if self.save_logs:
            self.logger.log_kv('t_opt', t_opt)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
class IRL(base):
    def __init__(self, env, policy, baseline,
                 demo_paths=None,
                 seed=123,
                 save_logs=False,
                 irl_model_wt=1.0,
                 irl_batch_size=128,
                 train_irl=True,
                 irl_model=None,
                 no_reward=True,
                 discrim_train_itrs=20,
                 entropy_weight=0.1,
                 augmentation=False,
                 lower_lr_on_main_loop_percentage=None,
                 discr_lr=1e-3,
                 call_super=False,
                 **kwargs):
        super().__init__(env=env, policy=policy, baseline=baseline,
                         demo_paths=demo_paths, save_logs=save_logs, **kwargs)
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.seed = seed
        self.save_logs = save_logs
        self.running_score = None
        self.demo_paths = demo_paths
        self.iter_count = 0.0
        self.irl_model = irl_model
        self.irl_model_wt = irl_model_wt
        self.irl_batch_size = irl_batch_size
        self.train_irl = train_irl
        self.no_reward = no_reward
        self.entropy_weight = entropy_weight
        self.discrim_train_itrs = discrim_train_itrs
        self.global_status = dict()
        self.dump_paths = False
        self.default_lr = discr_lr
        self.lower_lr_on_main_loop_percentage = lower_lr_on_main_loop_percentage
        if isinstance(self.lower_lr_on_main_loop_percentage, list):
            self.lower_lr_on_main_loop_percentage = np.array(
                self.lower_lr_on_main_loop_percentage)
        if augmentation > 0:
            from inverse_rl_dexterous_hand.inverse_rl.augmentation import Augmentation
            self.augmentation = Augmentation(env, augment_times=augmentation)
        else:
            self.augmentation = None
        if save_logs:
            self.logger = DataLog()

    @property
    def checkpoint(self):
        save_checkpoint_funct = getattr(self.irl_model, "save_checkpoint", None)
        if not save_checkpoint_funct:
            return [self.policy, self.baseline, self.irl_model, self.global_status]
        else:
            return [self.policy, self.baseline, self.global_status]

    def save_checkpoint(self, **kwargs):
        save_checkpoint_funct = getattr(self.irl_model, "save_checkpoint", None)
        if save_checkpoint_funct:
            save_checkpoint_funct(kwargs['path'], kwargs['iteration'])

    def load_checkpoint(self, checkpoint, **kwargs):
        load_checkpoint_funct = getattr(self.irl_model, "load_checkpoint", None)
        if load_checkpoint_funct:
            load_checkpoint_funct(kwargs['path'])
            self.policy, self.baseline, self.global_status = checkpoint
        else:
            self.policy, self.baseline, self.irl_model, self.global_status = checkpoint

    def eval_irl(self, paths, training_paths_from_policy=True):
        if self.no_reward:
            tot_rew = 0
            for path in paths:
                tot_rew += np.sum(path['rewards'])
                path['rewards'] *= 0
            if training_paths_from_policy:
                self.logger.log_kv('OriginalTaskAverageReturn', tot_rew / float(len(paths)))
        if self.irl_model_wt <= 0:
            return paths
        probs = self.irl_model.eval(paths)
        probs_flat = np.concatenate(probs)  # trajectory length varies
        if self.train_irl and training_paths_from_policy:
            self.logger.log_kv('IRLRewardMean', np.mean(probs_flat))
            self.logger.log_kv('IRLRewardMax', np.max(probs_flat))
            self.logger.log_kv('IRLRewardMin', np.min(probs_flat))
        if self.irl_model.score_trajectories:
            # TODO: should I add to reward here or after advantage computation? by Justin Fu
            for i, path in enumerate(paths):
                path['rewards'][-1] += self.irl_model_wt * probs[i]
        else:
            for i, path in enumerate(paths):
                path['rewards'] += self.irl_model_wt * probs[i]
        return paths

    def fit_irl(self, paths, main_loop_step, main_loop_percentage, num_cpu,
                policy_updates_count):
        if self.irl_model_wt <= 0 or not self.train_irl:
            return
        if self.no_reward:
            tot_rew = 0
            for path in paths:
                tot_rew += np.sum(path['rewards'])
                path['rewards'] *= 0
        lr = self.default_lr
        if self.lower_lr_on_main_loop_percentage is not None:
            elements_lower_than_thresholds = (
                self.lower_lr_on_main_loop_percentage < main_loop_percentage).sum()
            lr *= 0.1**elements_lower_than_thresholds
        mean_loss = self.irl_model.fit(paths,
                                       policy=self.policy,
                                       logger=self.logger,
                                       batch_size=self.irl_batch_size,
                                       max_itrs=self.discrim_train_itrs,
                                       lr=lr,
                                       num_cpu=num_cpu,
                                       policy_updates_count=policy_updates_count,
                                       main_loop_step=main_loop_step,
                                       main_loop_percentage=main_loop_percentage)
        self.logger.log_kv('IRLLoss', mean_loss)
class BatchREINFORCE:
    def __init__(self, env, policy, baseline,
                 learn_rate=0.01,
                 seed=None,
                 save_logs=False):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

    def CPI_surrogate(self, observations, actions, advantages):
        adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        surr = torch.mean(LR * adv_var)
        return surr

    def kl_old_new(self, observations, actions):
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        return mean_kl

    def flat_vpg(self, observations, actions, advantages):
        cpi_surr = self.CPI_surrogate(observations, actions, advantages)
        vpg_grad = torch.autograd.grad(cpi_surr, self.policy.trainable_params)
        vpg_grad = np.concatenate([g.contiguous().view(-1).data.numpy() for g in vpg_grad])
        return vpg_grad

    # ----------------------------------------------------------
    def train_step(self, N,
                   sample_mode='trajectories',
                   env_name=None,
                   T=1e6,
                   gamma=0.995,
                   gae_lambda=0.98,
                   num_cpu='max'):
        # Clean up input arguments
        if env_name is None:
            env_name = self.env.env_id
        if sample_mode != 'trajectories' and sample_mode != 'samples':
            print("sample_mode in NPG must be either 'trajectories' or 'samples'")
            quit()

        ts = timer.time()
        if sample_mode == 'trajectories':
            paths = trajectory_sampler.sample_paths_parallel(N, self.policy, T, env_name,
                                                             self.seed, num_cpu)
        elif sample_mode == 'samples':
            paths = batch_sampler.sample_paths(N, self.policy, T, env_name=env_name,
                                               pegasus_seed=self.seed, num_cpu=num_cpu)
        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)
        self.seed = self.seed + N if self.seed is not None else self.seed

        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)
        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths, return_errors=True)
            self.logger.log_kv('time_VF', timer.time() - ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)
        return eval_statistics

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + self.alpha * vpg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', self.alpha)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)

        return base_stats

    def log_rollout_statistics(self, paths):
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        self.logger.log_kv('stoc_pol_mean', mean_return)
        self.logger.log_kv('stoc_pol_std', std_return)
        self.logger.log_kv('stoc_pol_max', max_return)
        self.logger.log_kv('stoc_pol_min', min_return)
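# Minimal usage sketch for the agent above (hedged: env, policy, and baseline are assumed
# to be mjrl-style objects constructed elsewhere, e.g. a GymEnv, a Gaussian MLP policy,
# and an MLP baseline; the hyperparameters are illustrative, not recommendations):
agent = BatchREINFORCE(env, policy, baseline, learn_rate=0.01, seed=123, save_logs=True)
for itr in range(100):
    # each call samples N trajectories, computes GAE advantages, and takes one policy step
    stats = agent.train_step(N=10, sample_mode='trajectories', gamma=0.995, gae_lambda=0.98)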
class BC:
    def __init__(self, expert_paths, policy,
                 epochs=5,
                 batch_size=64,
                 lr=1e-3,
                 optimizer=None):
        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()

        # get transformations
        observations = np.concatenate([path["observations"] for path in expert_paths])
        actions = np.concatenate([path["actions"] for path in expert_paths])
        in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
        out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)

        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)

        # construct optimizer
        self.optimizer = torch.optim.Adam(
            self.policy.trainable_params, lr=lr) if optimizer is None else optimizer

    def loss(self, obs, act):
        LL, mu, log_std = self.policy.new_dist_info(obs, act)
        # minimize negative log likelihood
        return -torch.mean(LL)

    def train(self):
        observations = np.concatenate([path["observations"] for path in self.expert_paths])
        actions = np.concatenate([path["actions"] for path in self.expert_paths])

        params_before_opt = self.policy.get_param_values()
        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in tqdm(range(self.epochs)):
            self.logger.log_kv('epoch', ep)
            loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
            self.logger.log_kv('loss', loss_val)
            self.logger.log_kv('time', (timer.time() - ts))
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                self.optimizer.zero_grad()
                loss = self.loss(obs, act)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        self.logger.log_kv('epoch', self.epochs)
        loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
        self.logger.log_kv('loss', loss_val)
        self.logger.log_kv('time', (timer.time() - ts))
class DAPG(NPG):
    def __init__(self, env, policy, baseline,
                 demo_paths=None,
                 normalized_step_size=0.01,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=123,
                 save_logs=False,
                 kl_dist=None,
                 lam_0=1.0,   # demo coef
                 lam_1=0.95,  # decay coef
                 entropy_weight=0,
                 dump_paths=False,
                 augmentation=False):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        self.demo_paths = demo_paths
        self.lam_0 = lam_0
        self.lam_1 = lam_1
        self.iter_count = 0.0
        self.global_status = dict()
        self.entropy_weight = entropy_weight
        self.dump_paths = dump_paths
        if augmentation > 0:
            from mt_src.inverse_rl.augmentation import Augmentation
            self.augmentation = Augmentation(env, augment_times=augmentation)
        else:
            self.augmentation = None
        if self.dump_paths:
            from mt_src.inverse_rl.models.fusion_manager import DiskFusionDistr
            self.fusion = DiskFusionDistr(itr_offset=10000)
        if save_logs:
            self.logger = DataLog()

    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

        if self.demo_paths is not None and self.lam_0 > 0.0:
            demo_obs = np.concatenate([path["observations"] for path in self.demo_paths])
            demo_act = np.concatenate([path["actions"] for path in self.demo_paths])
            demo_adv = self.lam_0 * (self.lam_1**self.iter_count) * np.ones(demo_obs.shape[0])
            self.iter_count += 1
            # concatenate all
            all_obs = np.concatenate([observations, demo_obs])
            all_act = np.concatenate([actions, demo_act])
            all_adv = 1e-2 * np.concatenate(
                [advantages / (np.std(advantages) + 1e-8), demo_adv])
        else:
            all_obs = observations
            all_act = actions
            all_adv = advantages

        entropy = np.sum(
            self.policy.log_std_val + np.log(np.sqrt(2 * np.pi * np.e)))  # taken from inverse_rl repo
        if self.save_logs:
            self.logger.log_kv('entropy', entropy)
        if self.entropy_weight > 0:
            all_adv = all_adv + self.entropy_weight * entropy

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # DAPG
        ts = timer.time()
        sample_coef = all_adv.shape[0] / advantages.shape[0]
        dapg_grad = sample_coef * self.flat_vpg(all_obs, all_act, all_adv)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats

    @property
    def checkpoint(self):
        return [self.policy, self.baseline, self.global_status]

    def load_checkpoint(self, checkpoint, **kwargs):
        self.policy, self.baseline, self.global_status = checkpoint
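# Worked example of the normalized step size used above (numbers are illustrative, not
# from a real run). With npg_grad = F^-1 g from conjugate gradient, the second-order KL
# estimate of the update is 0.5 * alpha^2 * g.F^-1 g = 0.5 * n_step_size = kl_dist, which
# is why n_step_size is set to twice the desired KL distance.
import numpy as np
kl_dist = 0.01                                  # desired KL divergence per policy update
n_step_size = 2.0 * kl_dist                     # delta = 0.02, logged as 'delta' above
g_dot_npg = 4.0                                 # hypothetical value of dapg_grad . npg_grad
alpha = np.sqrt(np.abs(n_step_size / (g_dot_npg + 1e-20)))
print(alpha)                                    # ~0.0707; update is curr_params + alpha * npg_grad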
class BatchREINFORCE:
    def __init__(self, env, policy, baseline,
                 learn_rate=0.01,
                 seed=123,
                 desired_kl=None,
                 save_logs=False,
                 **kwargs):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.running_score = None
        self.desired_kl = desired_kl
        if save_logs:
            self.logger = DataLog()

    def CPI_surrogate(self, observations, actions, advantages):
        adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        surr = torch.mean(LR * adv_var)
        return surr

    def kl_old_new(self, observations, actions):
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        return mean_kl

    def flat_vpg(self, observations, actions, advantages):
        cpi_surr = self.CPI_surrogate(observations, actions, advantages)
        vpg_grad = torch.autograd.grad(cpi_surr, self.policy.trainable_params)
        vpg_grad = np.concatenate([g.contiguous().view(-1).data.numpy() for g in vpg_grad])
        return vpg_grad

    # ----------------------------------------------------------
    def train_step(self, N,
                   env=None,
                   sample_mode='trajectories',
                   horizon=1e6,
                   gamma=0.995,
                   gae_lambda=0.97,
                   num_cpu='max',
                   env_kwargs=None):
        # Clean up input arguments
        env = self.env.env_id if env is None else env
        if sample_mode != 'trajectories' and sample_mode != 'samples':
            print("sample_mode in NPG must be either 'trajectories' or 'samples'")
            quit()

        ts = timer.time()
        if sample_mode == 'trajectories':
            input_dict = dict(num_traj=N, env=env, policy=self.policy, horizon=horizon,
                              base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
            paths = trajectory_sampler.sample_paths(**input_dict)
        elif sample_mode == 'samples':
            input_dict = dict(num_samples=N, env=env, policy=self.policy, horizon=horizon,
                              base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
            paths = trajectory_sampler.sample_data_batch(**input_dict)

        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)

        self.seed = self.seed + N if self.seed is not None else self.seed

        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)

        # log number of samples
        if self.save_logs:
            num_samples = np.sum([p["rewards"].shape[0] for p in paths])
            self.logger.log_kv('num_samples', num_samples)

        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths, return_errors=True)
            self.logger.log_kv('time_VF', timer.time() - ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)

        return eval_statistics

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        observations, actions, advantages, base_stats, self.running_score = self.process_paths(paths)
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # Policy update with linesearch
        # ------------------------------
        if self.desired_kl is not None:
            max_ctr = 100
            alpha = self.alpha
            curr_params = self.policy.get_param_values()
            for ctr in range(max_ctr):
                new_params = curr_params + alpha * vpg_grad
                self.policy.set_param_values(new_params, set_new=True, set_old=False)
                kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
                if kl_dist <= self.desired_kl:
                    break
                else:
                    print("backtracking")
                    alpha = alpha / 2.0
        else:
            curr_params = self.policy.get_param_values()
            new_params = curr_params + self.alpha * vpg_grad

        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', self.alpha)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats

    def process_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return
        return observations, actions, advantages, base_stats, running_score

    def log_rollout_statistics(self, paths):
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        self.logger.log_kv('stoc_pol_mean', mean_return)
        self.logger.log_kv('stoc_pol_std', std_return)
        self.logger.log_kv('stoc_pol_max', max_return)
        self.logger.log_kv('stoc_pol_min', min_return)
class BC:
    def __init__(self, expert_paths, policy,
                 epochs=5,
                 batch_size=64,
                 lr=1e-3,
                 optimizer=None):
        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()

        # get transformations
        observations = np.concatenate([path["observations"] for path in expert_paths])
        actions = np.concatenate([path["actions"] for path in expert_paths])
        in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
        out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)

        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)

        # set the variance of gaussian policy based on out_scale
        params = self.policy.get_param_values()
        params[-self.policy.m:] = np.log(out_scale + 1e-12)
        self.policy.set_param_values(params)

        # construct optimizer
        self.optimizer = torch.optim.Adam(
            self.policy.model.parameters(), lr=lr) if optimizer is None else optimizer

        # loss criterion is MSE for maximum likelihood estimation
        self.loss_function = torch.nn.MSELoss()

    def loss(self, obs, act):
        obs_var = Variable(torch.from_numpy(obs).float(), requires_grad=False)
        act_var = Variable(torch.from_numpy(act).float(), requires_grad=False)
        act_hat = self.policy.model(obs_var)
        return self.loss_function(act_hat, act_var.detach())

    def train(self):
        observations = np.concatenate([path["observations"] for path in self.expert_paths])
        actions = np.concatenate([path["actions"] for path in self.expert_paths])

        params_before_opt = self.policy.get_param_values()
        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in tqdm(range(self.epochs)):
            self.logger.log_kv('epoch', ep)
            loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
            self.logger.log_kv('loss', loss_val)
            self.logger.log_kv('time', (timer.time() - ts))
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                self.optimizer.zero_grad()
                loss = self.loss(obs, act)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        self.logger.log_kv('epoch', self.epochs)
        loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
        self.logger.log_kv('loss', loss_val)
        self.logger.log_kv('time', (timer.time() - ts))
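# Minimal usage sketch for the behavior-cloning class above (hedged: the demo file name is
# a placeholder; expert_paths is assumed to be a list of path dicts with "observations"
# and "actions" arrays, exactly the format consumed in __init__, and policy is assumed to
# be an mjrl-style Gaussian MLP policy constructed elsewhere):
import pickle
expert_paths = pickle.load(open('demos.pickle', 'rb'))   # hypothetical demo file
bc_agent = BC(expert_paths, policy=policy, epochs=5, batch_size=64, lr=1e-3)
bc_agent.train()   # warm-starts the policy before RL fine-tuning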
                    '-i', type=str, required=False, help='package to import')
args = parser.parse_args()
OUT_DIR = args.output
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)
if not os.path.exists(OUT_DIR + '/iterations'):
    os.mkdir(OUT_DIR + '/iterations')
if not os.path.exists(OUT_DIR + '/logs'):
    os.mkdir(OUT_DIR + '/logs')
with open(args.config, 'r') as f:
    job_data = eval(f.read())
if args.include:
    exec("import " + args.include)

# Unpack args and make files for easy access
logger = DataLog()
ENV_NAME = job_data['env_name']
EXP_FILE = OUT_DIR + '/job_data.json'
SEED = job_data['seed']

# base cases
if 'eval_rollouts' not in job_data.keys():
    job_data['eval_rollouts'] = 0
if 'save_freq' not in job_data.keys():
    job_data['save_freq'] = 10
if 'device' not in job_data.keys():
    job_data['device'] = 'cpu'
if 'hvp_frac' not in job_data.keys():
    job_data['hvp_frac'] = 1.0
if 'start_state' not in job_data.keys():
    job_data['start_state'] = 'init'
if 'learn_reward' not in job_data.keys():
    job_data['learn_reward'] = True
if 'num_cpu' not in job_data.keys():
    job_data['num_cpu'] = 1
if 'npg_hp' not in job_data.keys():
    job_data['npg_hp'] = dict()
if 'act_repeat' not in job_data.keys():
    job_data['act_repeat'] = 1
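# Hedged sketch of the config file this launcher expects: since the file is read with
# eval(), it can simply contain a Python dict literal. The keys below are the ones
# referenced above; the values are placeholders, not recommended settings.
example_job_data = {
    'env_name': 'relocate-v0',   # placeholder environment id
    'seed': 123,
    'eval_rollouts': 25,
    'save_freq': 10,
    'device': 'cpu',
    'hvp_frac': 1.0,
    'start_state': 'init',
    'learn_reward': True,
    'num_cpu': 4,
    'npg_hp': dict(),
    'act_repeat': 1,
}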
class BC:
    def __init__(self, expert_paths, policy,
                 epochs=5,
                 batch_size=64,
                 lr=1e-3,
                 optimizer=None,
                 loss_type='MSE',   # can be 'MLE' or 'MSE'
                 save_logs=True,
                 set_transforms=False,
                 **kwargs):
        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()
        self.loss_type = loss_type
        self.save_logs = save_logs

        if set_transforms:
            in_shift, in_scale, out_shift, out_scale = self.compute_transformations()
            self.set_transformations(in_shift, in_scale, out_shift, out_scale)
            self.set_variance_with_data(out_scale)

        # construct optimizer
        self.optimizer = torch.optim.Adam(
            self.policy.trainable_params, lr=lr) if optimizer is None else optimizer

        # Loss criterion if required
        if loss_type == 'MSE':
            self.loss_criterion = torch.nn.MSELoss()

        # make logger
        if self.save_logs:
            self.logger = DataLog()

    def compute_transformations(self):
        # get transformations
        if self.expert_paths == [] or self.expert_paths is None:
            in_shift, in_scale, out_shift, out_scale = None, None, None, None
        else:
            observations = np.concatenate([path["observations"] for path in self.expert_paths])
            actions = np.concatenate([path["actions"] for path in self.expert_paths])
            in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
            out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)
        return in_shift, in_scale, out_shift, out_scale

    def set_transformations(self, in_shift=None, in_scale=None,
                            out_shift=None, out_scale=None):
        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)

    def set_variance_with_data(self, out_scale):
        # set the variance of gaussian policy based on out_scale
        params = self.policy.get_param_values()
        params[-self.policy.m:] = np.log(out_scale + 1e-12)
        self.policy.set_param_values(params)

    def loss(self, data, idx=None):
        if self.loss_type == 'MLE':
            return self.mle_loss(data, idx)
        elif self.loss_type == 'MSE':
            return self.mse_loss(data, idx)
        else:
            print("Please use valid loss type")
            return None

    def mle_loss(self, data, idx):
        # use indices if provided (e.g. for mini-batching)
        # otherwise, use all the data
        idx = range(data['observations'].shape[0]) if idx is None else idx
        if type(data['observations']) == torch.Tensor:
            idx = torch.LongTensor(idx)
        obs = data['observations'][idx]
        act = data['expert_actions'][idx]
        LL, mu, log_std = self.policy.new_dist_info(obs, act)
        # minimize negative log likelihood
        return -torch.mean(LL)

    def mse_loss(self, data, idx=None):
        idx = range(data['observations'].shape[0]) if idx is None else idx
        if type(data['observations']) is torch.Tensor:
            idx = torch.LongTensor(idx)
        obs = data['observations'][idx]
        act_expert = data['expert_actions'][idx]
        if type(data['observations']) is not torch.Tensor:
            obs = Variable(torch.from_numpy(obs).float(), requires_grad=False)
            act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False)
        act_pi = self.policy.model(obs)
        return self.loss_criterion(act_pi, act_expert.detach())

    def fit(self, data, suppress_fit_tqdm=False, **kwargs):
        # data is a dict
        # keys should have "observations" and "expert_actions"
        validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]])
        assert validate_keys is True
        ts = timer.time()
        num_samples = data["observations"].shape[0]

        # log stats before
        if self.save_logs:
            loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0]
            self.logger.log_kv('loss_before', loss_val)

        # train loop
        for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm):
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                self.optimizer.zero_grad()
                loss = self.loss(data, idx=rand_idx)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)

        # log stats after
        if self.save_logs:
            self.logger.log_kv('epoch', self.epochs)
            loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0]
            self.logger.log_kv('loss_after', loss_val)
            self.logger.log_kv('time', (timer.time() - ts))

    def train(self, **kwargs):
        observations = np.concatenate([path["observations"] for path in self.expert_paths])
        expert_actions = np.concatenate([path["actions"] for path in self.expert_paths])
        data = dict(observations=observations, expert_actions=expert_actions)
        self.fit(data, **kwargs)
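# Sketch of calling fit() directly with the dict format this class expects (hedged: the
# random arrays and their dimensions are placeholders standing in for real demonstration
# data, and policy is an mjrl-style policy assumed to be constructed elsewhere):
import numpy as np
data = dict(observations=np.random.randn(256, 39).astype(np.float32),
            expert_actions=np.random.randn(256, 30).astype(np.float32))
bc_agent = BC(expert_paths=[], policy=policy, loss_type='MSE', set_transforms=False)
bc_agent.fit(data, suppress_fit_tqdm=True)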
class NPG(BatchREINFORCE):
    def __init__(self, env, policy, baseline, optim,
                 normalized_step_size=0.01,
                 const_learn_rate=None,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 kl_dist=None):
        """
        All inputs are expected in mjrl's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.optim = optim
        self.alpha = const_learn_rate
        self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        self.n_steps = 0
        if save_logs:
            self.logger = DataLog()

    def policy_kl_fn(self, policy, obs, act):
        old_dist_info = policy.old_dist_info(obs, act)
        new_dist_info = policy.new_dist_info(obs, act)
        mean_kl = policy.mean_kl(new_dist_info, old_dist_info)
        return mean_kl

    def kl_closure(self, policy, observations, actions, kl_fn):
        def func(params):
            old_params = policy.get_param_values()
            params = parameters_to_vector(params).data.numpy()
            policy.set_param_values(params, set_new=True, set_old=True)
            f = kl_fn(policy, observations, actions)
            tmp_params = policy.trainable_params
            policy.set_param_values(old_params, set_new=True, set_old=True)
            return f, tmp_params
        return func

    def HVP(self, policy, observations, actions, vec, regu_coef=None):
        regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef
        # vec = Variable(torch.from_numpy(vector).float(), requires_grad=False)
        if self.hvp_subsample is not None and self.hvp_subsample < 0.99:
            num_samples = observations.shape[0]
            rand_idx = np.random.choice(num_samples,
                                        size=int(self.hvp_subsample * num_samples))
            obs = observations[rand_idx]
            act = actions[rand_idx]
        else:
            obs = observations
            act = actions
        old_dist_info = policy.old_dist_info(obs, act)
        new_dist_info = policy.new_dist_info(obs, act)
        mean_kl = policy.mean_kl(new_dist_info, old_dist_info)
        grad_fo = torch.autograd.grad(mean_kl, policy.trainable_params, create_graph=True)
        flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo])
        h = torch.sum(flat_grad * vec)
        hvp = torch.autograd.grad(h, policy.trainable_params)
        hvp_flat = torch.cat([g.contiguous().view(-1).data for g in hvp])
        # hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp])
        hvp_res = hvp_flat + regu_coef * vec
        return hvp_res

    def build_Hvp_eval(self, policy, inputs, regu_coef=None):
        def eval(theta, v):
            policy_tmp = copy.deepcopy(policy)
            policy_tmp.set_param_values(theta.data.numpy())
            full_inp = [policy_tmp] + inputs + [v] + [regu_coef]
            Hvp = self.HVP(*full_inp)
            return Hvp
        return eval

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        self.n_steps += len(advantages)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        self.optim.zero_grad()
        # Optimization. Negate gradient since the optimizer is minimizing.
        vpg_grad = -self.flat_vpg(observations, actions, advantages)
        vector_to_gradients(Variable(torch.from_numpy(vpg_grad).float()),
                            self.policy.trainable_params)
        closure = self.kl_closure(self.policy, observations, actions, self.policy_kl_fn)
        info = self.optim.step(closure)
        self.policy.set_param_values(self.policy.get_param_values())

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', info['alpha'])
            self.logger.log_kv('delta', info['delta'])
            # self.logger.log_kv('time_vpg', t_gLL)
            # self.logger.log_kv('time_npg', t_FIM)
            # self.logger.log_kv('kl_dist', kl_dist)
            # self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            self.logger.log_kv('steps', self.n_steps)
            try:
                success_rate = self.env.env.env.evaluate_success(paths)
                self.logger.log_kv('success_rate', success_rate)
            except:
                pass

        return base_stats
class TRPO(NPG):
    def __init__(self, env, policy, baseline,
                 kl_dist=0.01,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=123,
                 save_logs=False,
                 normalized_step_size=0.01,
                 **kwargs):
        """
        All inputs are expected in mjrl's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        for k in range(100):
            new_params = curr_params + alpha * npg_grad
            self.policy.set_param_values(new_params, set_new=True, set_old=False)
            kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
            surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
            if kl_dist < self.kl_dist:
                break
            else:
                alpha = 0.9 * alpha  # backtrack
                print("Step size too high. Backtracking. | kl = %f | surr diff = %f" %
                      (kl_dist, surr_after - surr_before))
            if k == 99:
                alpha = 0.0

        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
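# Arithmetic of the backtracking line search above (values illustrative, not from a real
# run): every rejected step shrinks alpha by 10%, so after b backtracks the effective step
# is alpha * 0.9**b; if all 100 attempts violate kl_dist < self.kl_dist, alpha is zeroed
# and the update becomes a no-op.
alpha0 = 0.05
for b in [0, 5, 10, 22]:
    print(b, alpha0 * 0.9**b)   # 0.05, ~0.0295, ~0.0174, ~0.0049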
class DAPG(NPG):
    def __init__(
            self,
            env,
            policy,
            baseline,
            demo_paths=None,
            normalized_step_size=0.01,
            FIM_invert_args={'iters': 10, 'damping': 1e-4},
            hvp_sample_frac=1.0,
            seed=None,
            save_logs=False,
            kl_dist=None,
            lam_0=1.0,   # demo coef
            lam_1=0.95,  # decay coef
    ):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        self.demo_paths = demo_paths
        self.lam_0 = lam_0
        self.lam_1 = lam_1
        self.iter_count = 0.0
        if save_logs:
            self.logger = DataLog()

    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

        if self.demo_paths is not None and self.lam_0 > 0.0:
            demo_obs = np.concatenate([path["observations"] for path in self.demo_paths])
            demo_act = np.concatenate([path["actions"] for path in self.demo_paths])
            demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0])
            # concatenate all
            all_obs = np.concatenate([observations, demo_obs])
            all_act = np.concatenate([actions, demo_act])
            all_adv = 1e-2 * np.concatenate([advantages / (np.std(advantages) + 1e-8), demo_adv])
        else:
            all_obs = observations
            all_act = actions
            all_adv = advantages

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions,
                                         advantages).data.numpy().ravel()[0]

        # DAPG
        ts = timer.time()
        sample_coef = all_adv.shape[0] / advantages.shape[0]
        dapg_grad = sample_coef * self.flat_vpg(all_obs, all_act, all_adv)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions,
                                        advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                success_rate = self.env.env.env.evaluate_success(paths)
                self.logger.log_kv('success_rate', success_rate)
            except:
                pass

        return base_stats
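For orientation, a minimal construction sketch for this class follows. The module paths are the usual mjrl ones (GymEnv, MLP policy, MLPBaseline); the environment id, demo file name, and hyperparameter values are placeholders for illustration only, and the baseline is assumed to be fit elsewhere in the training loop.

# Construction sketch (illustrative; env id, file names, and hyperparameters are assumptions)
import pickle
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline

e = GymEnv('relocate-v0')                              # any registered env id
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
baseline = MLPBaseline(e.spec)
demo_paths = pickle.load(open('demos.pickle', 'rb'))   # list of path dicts

agent = DAPG(e, policy, baseline, demo_paths=demo_paths,
             normalized_step_size=0.1, lam_0=1e-2, lam_1=0.95,
             seed=123, save_logs=True)

The demonstration term decays geometrically: at iteration t the demo transitions receive a constant pseudo-advantage lam_0 * lam_1**t, so imitation pressure fades as on-policy data improves.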
class NPG(BatchREINFORCE):
    def __init__(self,
                 env,
                 policy,
                 baseline,
                 normalized_step_size=0.01,
                 const_learn_rate=None,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=123,
                 save_logs=False,
                 kl_dist=None,
                 input_normalization=None,
                 **kwargs):
        """
        All inputs are expected in mjrl's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = const_learn_rate
        self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

        # input normalization (running average)
        self.input_normalization = input_normalization
        if self.input_normalization is not None:
            if self.input_normalization > 1 or self.input_normalization <= 0:
                self.input_normalization = None
        self.global_status = dict()

    def HVP(self, observations, actions, vector, regu_coef=None):
        regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef
        vec = Variable(torch.from_numpy(vector).float(), requires_grad=False)
        if self.hvp_subsample is not None and self.hvp_subsample < 0.99:
            num_samples = observations.shape[0]
            rand_idx = np.random.choice(num_samples,
                                        size=int(self.hvp_subsample * num_samples))
            obs = observations[rand_idx]
            act = actions[rand_idx]
        else:
            obs = observations
            act = actions
        old_dist_info = self.policy.old_dist_info(obs, act)
        new_dist_info = self.policy.new_dist_info(obs, act)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        grad_fo = torch.autograd.grad(mean_kl,
                                      self.policy.trainable_params,
                                      create_graph=True)
        flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo])
        h = torch.sum(flat_grad * vec)
        hvp = torch.autograd.grad(h, self.policy.trainable_params)
        hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp])
        return hvp_flat + regu_coef * vector

    def build_Hvp_eval(self, inputs, regu_coef=None):
        def eval(v):
            full_inp = inputs + [v] + [regu_coef]
            Hvp = self.HVP(*full_inp)
            return Hvp
        return eval

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        observations, actions, advantages, base_stats, self.running_score = \
            self.process_paths(paths)
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # normalize inputs if necessary
        if self.input_normalization:
            data_in_shift, data_in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
            pi_in_shift, pi_in_scale = self.policy.model.in_shift.data.numpy(), self.policy.model.in_scale.data.numpy()
            pi_out_shift, pi_out_scale = self.policy.model.out_shift.data.numpy(), self.policy.model.out_scale.data.numpy()
            pi_in_shift = self.input_normalization * pi_in_shift + (1 - self.input_normalization) * data_in_shift
            pi_in_scale = self.input_normalization * pi_in_scale + (1 - self.input_normalization) * data_in_scale
            self.policy.model.set_transformations(pi_in_shift, pi_in_scale,
                                                  pi_out_shift, pi_out_scale)

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions,
                                         advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        if self.alpha is not None:
            alpha = self.alpha
            n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
        else:
            n_step_size = self.n_step_size
            alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions,
                                        advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats

    @property
    def checkpoint(self):
        return [self.policy, self.baseline, self.global_status]

    def load_checkpoint(self, checkpoint, **kwargs):
        self.policy, self.baseline, self.global_status = checkpoint
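The natural gradient direction above is computed without ever forming the Fisher matrix: HVP supplies Fisher-vector products (as Hessian-vector products of the mean KL, plus damping), and cg_solve approximately solves F x = g. As a minimal sketch of what that call computes (this is standard conjugate gradient, not the library's implementation), assuming hvp(v) returns F v for a flat numpy vector:

# Illustrative CG sketch: approximately solves F x = g given only products F v.
import numpy as np

def cg_sketch(hvp, g, x_0=None, cg_iters=10, tol=1e-10):
    x = np.zeros_like(g) if x_0 is None else x_0.copy()
    r = g - hvp(x)          # residual b - A x
    p = r.copy()
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = hvp(p)
        step = rdotr / (p.dot(Ap) + 1e-20)
        x += step * p
        r -= step * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x  # x ~= F^{-1} g, the unscaled natural gradient direction

The subsequent step size alpha = sqrt(n_step_size / (g^T F^{-1} g)) then makes the second-order KL expansion of the update approximately equal to n_step_size / 2, i.e. the desired kl_dist.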
class NPG(BatchREINFORCE):
    def __init__(self, env, policy, baseline,
                 normalized_step_size=0.01,
                 const_learn_rate=None,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 kl_dist=None):
        """
        All inputs are expected in mjrl's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = const_learn_rate
        self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

    def HVP(self, observations, actions, vector, regu_coef=None):
        regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef
        vec = Variable(torch.from_numpy(vector).float(), requires_grad=False)
        if self.hvp_subsample is not None and self.hvp_subsample < 0.99:
            num_samples = observations.shape[0]
            rand_idx = np.random.choice(num_samples, size=int(self.hvp_subsample * num_samples))
            obs = observations[rand_idx]
            act = actions[rand_idx]
        else:
            obs = observations
            act = actions
        old_dist_info = self.policy.old_dist_info(obs, act)
        new_dist_info = self.policy.new_dist_info(obs, act)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        grad_fo = torch.autograd.grad(mean_kl, self.policy.trainable_params, create_graph=True)
        flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo])
        h = torch.sum(flat_grad * vec)
        hvp = torch.autograd.grad(h, self.policy.trainable_params)
        hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp])
        return hvp_flat + regu_coef * vector

    def build_Hvp_eval(self, inputs, regu_coef=None):
        def eval(v):
            full_inp = inputs + [v] + [regu_coef]
            Hvp = self.HVP(*full_inp)
            return Hvp
        return eval

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        if self.alpha is not None:
            alpha = self.alpha
            n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
        else:
            n_step_size = self.n_step_size
            alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)

        return base_stats
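If the natural gradient direction looks suspect (e.g. tiny surrogate improvement despite a healthy kl_dist), one quick diagnostic is to check how well the CG solution satisfies F x = g using only the methods defined above. This is an optional sanity-check sketch, not part of mjrl; agent is assumed to be an NPG instance and the arrays are in the same format train_from_paths uses.

# Optional diagnostic: relative residual of the CG solve; it should shrink as cg_iters grows.
import numpy as np

def cg_residual(agent, observations, actions, vpg_grad, npg_grad):
    hvp = agent.build_Hvp_eval([observations, actions],
                               regu_coef=agent.FIM_invert_args['damping'])
    residual = hvp(npg_grad) - vpg_grad          # F x - g
    return np.linalg.norm(residual) / (np.linalg.norm(vpg_grad) + 1e-12)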
                    type=str, required=True, help='location to store results')
parser.add_argument('--config', type=str, required=True,
                    help='path to config file with exp params')
args = parser.parse_args()
OUT_DIR = args.output
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)
with open(args.config, 'r') as f:
    job_data = eval(f.read())

# Unpack args and make files for easy access
logger = DataLog()
ENV_NAME = job_data['env_name']
EXP_FILE = OUT_DIR + '/job_data.json'
SEED = job_data['seed']

# base cases
if 'num_models' not in job_data.keys():
    job_data['num_models'] = 1
if job_data['num_models'] == 1 or 'omega' not in job_data.keys():
    job_data['omega'] = 0.0
if 'eval_rollouts' not in job_data.keys():
    job_data['eval_rollouts'] = 0
if 'save_freq' not in job_data.keys():
    job_data['save_freq'] = 10
if 'device_path' not in job_data.keys():
    job_data['device_path'] = None
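Because the config file is read with eval(), it is expected to contain a single Python dict literal. A minimal example, showing only the keys referenced in this snippet (real configs carry additional algorithm-specific fields, and the env id here is a placeholder):

# Example --config file contents (illustrative)
{
    'env_name'      : 'relocate-v0',
    'seed'          : 123,
    # optional -- the "base cases" block above fills these defaults when absent
    'num_models'    : 1,
    'omega'         : 0.0,
    'eval_rollouts' : 0,
    'save_freq'     : 10,
    'device_path'   : None,
}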
class TRPO(NPG):
    def __init__(self,
                 env,
                 policy,
                 baseline,
                 optim,
                 kl_dist=0.01,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 normalized_step_size=0.01):
        """
        All inputs are expected in mjrl's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.optim = optim
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        self.n_steps = 0
        if save_logs:
            self.logger = DataLog()

    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        self.n_steps += len(advantages)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        self.optim.zero_grad()
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        vector_to_gradients(Variable(torch.from_numpy(vpg_grad).float()),
                            self.policy.trainable_params)
        t_gLL += timer.time() - ts

        # NPG
        # Note: unlike the standard NPG, negation is not needed here since the optimizer does not
        # apply the update step.
        ts = timer.time()
        closure = self.kl_closure(self.policy, observations, actions, self.policy_kl_fn)
        info = self.optim.step(closure, execute_update=False)
        npg_grad = info['natural_grad'].data.numpy()
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        for k in range(100):
            new_params = curr_params + alpha * npg_grad
            self.policy.set_param_values(new_params, set_new=True, set_old=False)
            kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
            surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
            if kl_dist < self.kl_dist:
                break
            else:
                alpha = 0.9 * alpha  # backtrack
                # print("Step size too high. Backtracking. | kl = %f | surr diff = %f" %
                #       (kl_dist, surr_after - surr_before))
            if k == 99:
                alpha = 0.0

        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            self.logger.log_kv('steps', self.n_steps)

        return base_stats
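The loop above is what distinguishes TRPO from plain NPG here: the analytically derived step alpha is only a starting point, and it is shrunk by a factor of 0.9 until the measured KL falls inside the trust region, with a zero step if 100 shrinkages are not enough. The same rule, restated as a standalone sketch (the helper and its kl_fn argument are hypothetical, not part of mjrl):

# Illustrative restatement of the backtracking rule (hypothetical helper)
def backtrack_step(alpha, kl_fn, kl_limit, shrink=0.9, max_tries=100):
    """kl_fn(a) sets params to theta + a * direction and returns the measured KL."""
    for _ in range(max_tries):
        if kl_fn(alpha) < kl_limit:
            return alpha
        alpha *= shrink
    return 0.0  # give up: keep the old parameters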