class BC:
    def __init__(self, expert_paths, policy, epochs=5, batch_size=64, lr=1e-3, optimizer=None):
        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()

        # get transformations
        observations = np.concatenate([path["observations"] for path in expert_paths])
        actions = np.concatenate([path["actions"] for path in expert_paths])
        in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
        out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)

        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)

        # construct optimizer
        self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer

    def loss(self, obs, act):
        LL, mu, log_std = self.policy.new_dist_info(obs, act)
        # minimize negative log likelihood
        return -torch.mean(LL)

    def train(self):
        observations = np.concatenate([path["observations"] for path in self.expert_paths])
        actions = np.concatenate([path["actions"] for path in self.expert_paths])
        params_before_opt = self.policy.get_param_values()
        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in tqdm(range(self.epochs)):
            self.logger.log_kv('epoch', ep)
            loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
            self.logger.log_kv('loss', loss_val)
            self.logger.log_kv('time', (timer.time() - ts))
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                self.optimizer.zero_grad()
                loss = self.loss(obs, act)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        self.logger.log_kv('epoch', self.epochs)
        loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
        self.logger.log_kv('loss', loss_val)
        self.logger.log_kv('time', (timer.time() - ts))
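A minimal usage sketch for the behavior-cloning trainer above. The module paths, the MLP policy class, the environment id, and the pickle file name are illustrative assumptions; only the BC constructor arguments come from the listing.

import pickle
from pybRL.utils.gym_env import GymEnv             # assumed module path
from pybRL.policies.gaussian_mlp import MLP        # assumed Gaussian MLP policy class
from pybRL.algos.behavior_cloning import BC        # assumed module path for the class above

env = GymEnv('Hopper-v2')                          # any gym-style environment id
policy = MLP(env.spec, hidden_sizes=(32, 32), seed=123)
expert_paths = pickle.load(open('expert_demos.pickle', 'rb'))   # list of path dicts with "observations"/"actions"
bc_agent = BC(expert_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3)
bc_agent.train()
bc_agent.logger.save_log('bc_logs/')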
def train(env, policy, normalizer, hp, parentPipes, args):
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if os.path.isdir(args.logdir) == False:
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False:
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):
        # Initializing the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        if parentPipes:
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # number of directions left to evaluate
                # Dispatch positive-direction rollouts to the workers
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                # Collect positive-direction rewards and step counts
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p
                # Dispatch negative-direction rollouts to the workers
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                # Collect negative-direction rewards and step counts
                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                p = p + process_count
                # print('mp step has worked, ', p)
            print('total steps till now: ', total_steps, 'Processes done: ', p)
        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive", deltas[k], hp)
            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative", deltas[k], hp)

        # Sorting the rollouts by max(r_pos, r_neg) and selecting the best directions
        scores = {k: max(r_pos, r_neg) for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
        order = sorted(scores.keys(), key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # Standard deviation of the rewards in the selected best directions
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Evaluating the updated policy and logging
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log, keys=['steps', 'return'], save_loc='logs/')
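For reference, a sketch of the worker-side loop that the parallel variant above assumes: each child process holds one end of a multiprocessing Pipe, receives an `[_EXPLORE, [normalizer, policy, hp, direction, delta]]` message, runs one perturbed rollout, and replies with a `(return, step_count)` pair. The `_CLOSE` message, the gym-style environment constructor, and the `explore` signature are assumptions; only the message layout is taken from the listing.

import gym

def ExploreWorker(rank, childPipe, env_name, args):
    # One environment instance per worker process (assumed gym-style constructor).
    env = gym.make(env_name)
    while True:
        message, payload = childPipe.recv()
        if message == _EXPLORE:
            normalizer, policy, hp, direction, delta = payload
            # explore() is assumed to return (episode_return, step_count), matching the parent's recv()
            reward, steps = explore(env, normalizer, policy, direction, delta, hp)
            childPipe.send([reward, steps])
        elif message == _CLOSE:           # assumed shutdown message, not in the original listing
            childPipe.send(["close ok"])
            break
    childPipe.close()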
class TRPO(NPG):
    def __init__(self, env, policy, baseline,
                 kl_dist=0.01,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 normalized_step_size=0.01):
        """
        All inputs are expected in pybRL's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update (backtracking line search on the KL constraint)
        # --------------------------
        curr_params = self.policy.get_param_values()
        for k in range(100):
            new_params = curr_params + alpha * npg_grad
            self.policy.set_param_values(new_params, set_new=True, set_old=False)
            kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
            surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
            if kl_dist < self.kl_dist:
                break
            else:
                alpha = 0.9 * alpha  # backtrack
                print("Step size too high. Backtracking. | kl = %f | surr diff = %f" %
                      (kl_dist, surr_after - surr_before))
            if k == 99:
                alpha = 0.0

        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
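A sketch of how the TRPO agent above would typically be driven. The policy and baseline construction mirrors the earlier (assumed) setup; `train_step` is inherited from the BatchREINFORCE base class listed later in this section, and its arguments match that listing.

from pybRL.utils.gym_env import GymEnv                # assumed module path
from pybRL.policies.gaussian_mlp import MLP           # assumed policy class
from pybRL.baselines.mlp_baseline import MLPBaseline  # assumed baseline class

env = GymEnv('Hopper-v2')
policy = MLP(env.spec, hidden_sizes=(32, 32), seed=123)
baseline = MLPBaseline(env.spec)
agent = TRPO(env, policy, baseline, kl_dist=0.01, seed=123, save_logs=True)

for i in range(100):
    # train_step samples N trajectories, computes advantages, and performs one TRPO update
    stats = agent.train_step(N=10, sample_mode='trajectories', gamma=0.995, gae_lambda=0.97)
    print("iter %d: mean return = %.2f" % (i, stats[0]))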
def train(env, policy, normalizer, hp, job_name="default_exp", args=None):
    """
    Training using Augmented Random Search
    :param env : OpenAI gym environment
    :param policy : Object of class Policy
    :param normalizer : Object of class Normalizer
    :param hp : Object of class hp (hyperparameters)
    :param job_name : Name of the directory where you want to save data
    :param args : Extra arguments forwarded to policy.update (mirrors the parallel variant above)
    :returns : Nothing, trains the agent
    """
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if os.path.isdir(job_name) == False:
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False:
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):
        # Initializing the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas(hp)
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        # Getting the positive rewards in the positive directions
        for k in range(hp.nb_directions):
            positive_rewards[k], step_count_positive = explore(env, normalizer, policy,
                                                               "positive", deltas[k], hp)
            total_steps = total_steps + step_count_positive  # accumulate environment steps from every rollout

        # Getting the negative rewards in the negative/opposite directions
        for k in range(hp.nb_directions):
            negative_rewards[k], step_count_negative = explore(env, normalizer, policy,
                                                               "negative", deltas[k], hp)
            total_steps = total_steps + step_count_negative

        # Sorting the rollouts by max(r_pos, r_neg) and selecting the best directions
        scores = {k: max(r_pos, r_neg) for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
        order = sorted(scores.keys(), key=lambda x: -scores[x])[:hp.nb_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # Standard deviation of the rewards in the selected best directions
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Evaluating the updated policy and logging
        reward_evaluation, _ = explore(env, normalizer, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log, keys=['steps', 'return'], save_loc='logs/')
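The `explore` helper used above is not part of this listing. Based on its call sites (it returns a `(total_reward, step_count)` pair and applies the perturbation `delta` in the requested direction, or no perturbation when `direction` is None), a sketch could look like the following; the normalizer API, `policy.evaluate`, and `hp.episode_length` are assumed names.

def explore(env, normalizer, policy, direction, delta, hp):
    # Runs one episode with the policy perturbed along +delta, -delta, or unperturbed (direction=None).
    state = env.reset()
    done = False
    num_plays = 0
    sum_rewards = 0.0
    while not done and num_plays < hp.episode_length:          # hp.episode_length assumed
        normalizer.observe(state)                               # assumed online-normalizer API
        state = normalizer.normalize(state)
        action = policy.evaluate(state, delta, direction, hp)   # assumed perturbed-policy API
        state, reward, done, _ = env.step(action)
        sum_rewards += reward
        num_plays += 1
    return sum_rewards, num_plays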
class DAPG(NPG):
    def __init__(self, env, policy, baseline,
                 demo_paths=None,
                 normalized_step_size=0.01,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 kl_dist=None,
                 lam_0=1.0,   # demo coef
                 lam_1=0.95,  # decay coef
                 ):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.kl_dist = kl_dist if kl_dist is not None else 0.5 * normalized_step_size
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        self.demo_paths = demo_paths
        self.lam_0 = lam_0
        self.lam_1 = lam_1
        self.iter_count = 0.0
        if save_logs:
            self.logger = DataLog()

    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

        if self.demo_paths is not None and self.lam_0 > 0.0:
            demo_obs = np.concatenate([path["observations"] for path in self.demo_paths])
            demo_act = np.concatenate([path["actions"] for path in self.demo_paths])
            demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0])
            self.iter_count += 1
            # concatenate all
            all_obs = np.concatenate([observations, demo_obs])
            all_act = np.concatenate([actions, demo_act])
            all_adv = 1e-2 * np.concatenate([advantages / (np.std(advantages) + 1e-8), demo_adv])
        else:
            all_obs = observations
            all_act = actions
            all_adv = advantages

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

        # DAPG
        ts = timer.time()
        sample_coef = all_adv.shape[0] / advantages.shape[0]
        dapg_grad = sample_coef * self.flat_vpg(all_obs, all_act, all_adv)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        n_step_size = 2.0 * self.kl_dist
        alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
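A sketch of the usual two-stage DAPG recipe: warm-start the policy with the BC class from this section, then hand the same demonstrations to DAPG. The env/policy/baseline objects are assumed to be constructed as in the earlier sketches, and the lam_0/lam_1 values are illustrative.

import pickle
demo_paths = pickle.load(open('expert_demos.pickle', 'rb'))   # list of path dicts

# Stage 1: behavior cloning warm start (BC class listed above)
bc_agent = BC(demo_paths, policy=policy, epochs=5, batch_size=32, lr=1e-3)
bc_agent.train()

# Stage 2: fine-tune with the demo-augmented natural policy gradient
agent = DAPG(env, policy, baseline, demo_paths=demo_paths,
             normalized_step_size=0.1, lam_0=1e-2, lam_1=0.95,
             seed=123, save_logs=True)
for i in range(200):
    agent.train_step(N=24, sample_mode='trajectories', gamma=0.995, gae_lambda=0.97)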
class BatchREINFORCE:
    def __init__(self, env, policy, baseline, learn_rate=0.01, seed=None, save_logs=False):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

    def compute_reinforce_loss(self, observations, actions, advantages):
        """
        Computes the REINFORCE objective.
        :param observations : A list containing observations of all paths together
        :param actions : A list containing actions of all paths together
        :param advantages : A list containing advantages of all paths together
        :return : Combined loss over all paths. The likelihood ratio is used as the surrogate,
                  so its gradient at the current parameters equals the REINFORCE
                  log(policy) * advantage gradient.
        """
        adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        log_policy = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        loss = torch.mean(log_policy * adv_var)
        return loss

    def kl_old_new(self, observations, actions):
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        return mean_kl

    def flat_vpg(self, observations, actions, advantages):
        """
        Finds the gradients with respect to the REINFORCE loss
        :param observations : A list containing observations of all paths together
        :param actions : A list containing actions of all paths together
        :param advantages : A list containing advantages of all paths together
        :return : A flat numpy array containing gradients of the weights (as calculated by PyTorch)
        """
        loss = self.compute_reinforce_loss(observations, actions, advantages)
        if np.isinf(loss.data.numpy()):
            pdb.set_trace()
        vpg_grad = torch.autograd.grad(loss, self.policy.trainable_params)
        vpg_grad = np.concatenate([g.contiguous().view(-1).data.numpy() for g in vpg_grad])
        return vpg_grad

    def train_step(self, N, sample_mode='trajectories', env_name=None, T=1e6,
                   gamma=0.995, gae_lambda=0.98, num_cpu='max'):
        """
        The agent performs a single complete step: it samples trajectories,
        computes gradients, and updates its weights based on those gradients.
        :param N : Number of rollouts to collect
        :param sample_mode : If 'trajectories', the trajectory sampler is used (N = 5 implies 5 different trajectories).
                             If 'samples', the batch sampler is used (N = 5 implies 5 individual samples only).
        :param env_name : Name of env
        :param T : Maximum length of trajectory
        :param gamma : Discount factor
        :param gae_lambda : Eligibility trace
        :param num_cpu : Number of cores to use (on real systems this has to be 1)
        :return : A set of statistics regarding the current step:
                  [mean_return, std_return, min_return, max_return]
        """
        # Clean up input arguments
        if env_name is None:
            env_name = self.env.env_id
        if sample_mode != 'trajectories' and sample_mode != 'samples':
            print("sample_mode in NPG must be either 'trajectories' or 'samples'")
            quit()

        ts = timer.time()
        if sample_mode == 'trajectories':
            paths = trajectory_sampler.sample_paths_parallel(N, self.policy, T, env_name,
                                                             self.seed, num_cpu)
        elif sample_mode == 'samples':
            paths = batch_sampler.sample_paths(N, self.policy, T, env_name=env_name,
                                               pegasus_seed=self.seed, num_cpu=num_cpu)
        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)
        self.seed = self.seed + N if self.seed is not None else self.seed

        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)
        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths, return_errors=True)
            self.logger.log_kv('time_VF', timer.time() - ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)
        return eval_statistics

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        """
        Performs the gradient step for a given set of paths
        :param paths : A list of dictionaries as output by the samplers
        :return : A list of statistics for the paths: [mean_return, std_return, min_return, max_return].
                  The gradient update step is performed internally.
        """
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0

        # Optimization algorithm
        # --------------------------
        loss_before_training = self.compute_reinforce_loss(observations, actions,
                                                           advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + self.alpha * vpg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        loss_after_training = self.compute_reinforce_loss(observations, actions,
                                                          advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', self.alpha)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('loss_improvement', loss_after_training - loss_before_training)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats

    def log_rollout_statistics(self, paths):
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        self.logger.log_kv('stoc_pol_mean', mean_return)
        self.logger.log_kv('stoc_pol_std', std_return)
        self.logger.log_kv('stoc_pol_max', max_return)
        self.logger.log_kv('stoc_pol_min', min_return)
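A sketch of an outer training loop over train_step for the vanilla REINFORCE agent. It assumes env, policy, and baseline are constructed as in the earlier sketches; train_step returns the four return statistics with the rollout count N appended, as in the listing.

agent = BatchREINFORCE(env, policy, baseline, learn_rate=0.1, seed=123, save_logs=True)
for i in range(100):
    mean_ret, std_ret, min_ret, max_ret, n_paths = agent.train_step(
        N=10, sample_mode='trajectories', gamma=0.995, gae_lambda=0.98, num_cpu=1)
    print("iter %d | mean %.2f | min %.2f | max %.2f" % (i, mean_ret, min_ret, max_ret))
agent.logger.save_log('reinforce_logs/')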
class BC:
    def __init__(self, expert_paths, policy, epochs=5, batch_size=64, lr=1e-3, optimizer=None):
        self.policy = policy
        self.expert_paths = expert_paths
        self.epochs = epochs
        self.mb_size = batch_size
        self.logger = DataLog()

        # get transformations
        observations = np.concatenate([path["observations"] for path in expert_paths])
        actions = np.concatenate([path["actions"] for path in expert_paths])
        in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
        out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)

        # set scalings in the target policy
        self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
        self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)

        # set the variance of gaussian policy based on out_scale
        params = self.policy.get_param_values()
        params[-self.policy.m:] = np.log(out_scale + 1e-12)
        self.policy.set_param_values(params)

        # construct optimizer
        self.optimizer = torch.optim.Adam(self.policy.model.parameters(), lr=lr) if optimizer is None else optimizer

        # loss criterion is MSE for maximum likelihood estimation
        self.loss_function = torch.nn.MSELoss()

    def loss(self, obs, act):
        obs_var = Variable(torch.from_numpy(obs).float(), requires_grad=False)
        act_var = Variable(torch.from_numpy(act).float(), requires_grad=False)
        act_hat = self.policy.model(obs_var)
        return self.loss_function(act_hat, act_var.detach())

    def train(self):
        observations = np.concatenate([path["observations"] for path in self.expert_paths])
        actions = np.concatenate([path["actions"] for path in self.expert_paths])
        params_before_opt = self.policy.get_param_values()
        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in tqdm(range(self.epochs)):
            self.logger.log_kv('epoch', ep)
            loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
            self.logger.log_kv('loss', loss_val)
            self.logger.log_kv('time', (timer.time() - ts))
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                self.optimizer.zero_grad()
                loss = self.loss(obs, act)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        self.logger.log_kv('epoch', self.epochs)
        loss_val = self.loss(observations, actions).data.numpy().ravel()[0]
        self.logger.log_kv('loss', loss_val)
        self.logger.log_kv('time', (timer.time() - ts))
class PPO(BatchREINFORCE):
    def __init__(self, env, policy, baseline,
                 clip_coef=0.2,
                 epochs=10,
                 mb_size=64,
                 learn_rate=3e-4,
                 seed=0,
                 save_logs=False):
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.learn_rate = learn_rate
        self.seed = seed
        self.save_logs = save_logs
        self.clip_coef = clip_coef
        self.epochs = epochs
        self.mb_size = mb_size
        self.running_score = None
        if save_logs:
            self.logger = DataLog()
        self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate)

    def PPO_surrogate(self, observations, actions, advantages):
        adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
        old_dist_info = self.policy.old_dist_info(observations, actions)
        new_dist_info = self.policy.new_dist_info(observations, actions)
        LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
        LR_clip = torch.clamp(LR, min=1 - self.clip_coef, max=1 + self.clip_coef)
        ppo_surr = torch.mean(torch.min(LR * adv_var, LR_clip * adv_var))
        return ppo_surr

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Optimization algorithm
        # --------------------------
        surr_before = self.compute_reinforce_loss(observations, actions,
                                                  advantages).data.numpy().ravel()[0]
        params_before_opt = self.policy.get_param_values()

        ts = timer.time()
        num_samples = observations.shape[0]
        for ep in range(self.epochs):
            for mb in range(int(num_samples / self.mb_size)):
                rand_idx = np.random.choice(num_samples, size=self.mb_size)
                obs = observations[rand_idx]
                act = actions[rand_idx]
                adv = advantages[rand_idx]
                self.optimizer.zero_grad()
                loss = -self.PPO_surrogate(obs, act, adv)
                loss.backward()
                self.optimizer.step()
        params_after_opt = self.policy.get_param_values()
        surr_after = self.compute_reinforce_loss(observations, actions,
                                                 advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
        t_opt = timer.time() - ts

        # Log information
        if self.save_logs:
            self.logger.log_kv('t_opt', t_opt)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', surr_after - surr_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats
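A corresponding sketch for the PPO variant; the constructor arguments come from the class above, and everything else follows the earlier (assumed) env/policy/baseline setup.

agent = PPO(env, policy, baseline, clip_coef=0.2, epochs=10,
            mb_size=64, learn_rate=3e-4, seed=123, save_logs=True)
for i in range(100):
    # Each train_step samples N trajectories and runs `epochs` passes of clipped-surrogate minibatch updates
    stats = agent.train_step(N=10, sample_mode='trajectories', gamma=0.995, gae_lambda=0.97)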
class NPG(BatchREINFORCE):
    def __init__(self, env, policy, baseline,
                 normalized_step_size=0.01,
                 const_learn_rate=None,
                 FIM_invert_args={'iters': 10, 'damping': 1e-4},
                 hvp_sample_frac=1.0,
                 seed=None,
                 save_logs=False,
                 kl_dist=None):
        """
        All inputs are expected in pybRL's format unless specified
        :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
        :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
        :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
        :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
        :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
        :param seed: random seed
        """
        self.env = env
        self.policy = policy
        self.baseline = baseline
        self.alpha = const_learn_rate
        self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist
        self.seed = seed
        self.save_logs = save_logs
        self.FIM_invert_args = FIM_invert_args
        self.hvp_subsample = hvp_sample_frac
        self.running_score = None
        if save_logs:
            self.logger = DataLog()

    def HVP(self, observations, actions, vector, regu_coef=None):
        """
        Computes the Fisher (Hessian) vector product for a set of observations and actions
        :param observations : A numpy ndarray of observations
        :param actions : A numpy ndarray of actions
        :param vector : Input vector to multiply the Fisher matrix by
        :param regu_coef : Damping coefficient that keeps the product non-singular (positive definite)
        :return : The Hessian-vector product as a 1d numpy array
        """
        regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef
        vec = Variable(torch.from_numpy(vector).float(), requires_grad=False)
        if self.hvp_subsample is not None and self.hvp_subsample < 0.99:
            num_samples = observations.shape[0]
            rand_idx = np.random.choice(num_samples, size=int(self.hvp_subsample * num_samples))
            obs = observations[rand_idx]
            act = actions[rand_idx]
        else:
            obs = observations
            act = actions
        old_dist_info = self.policy.old_dist_info(obs, act)
        new_dist_info = self.policy.new_dist_info(obs, act)
        mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
        grad_fo = torch.autograd.grad(mean_kl, self.policy.trainable_params, create_graph=True)
        flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo])
        h = torch.sum(flat_grad * vec)
        hvp = torch.autograd.grad(h, self.policy.trainable_params)
        hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp])
        return hvp_flat + regu_coef * vector

    def build_Hvp_eval(self, inputs, regu_coef=None):
        """
        Returns a function that computes A*v for a vector v, as required by the conjugate gradient solver
        :param inputs : A list containing [observations, actions]
        :param regu_coef : Damping coefficient added so that A is positive definite / non-singular
        :return : A function that calculates A*v; it is used in the cg_solve routine
        """
        def eval(v):
            full_inp = inputs + [v] + [regu_coef]
            Hvp = self.HVP(*full_inp)
            return Hvp
        return eval

    # ----------------------------------------------------------
    def train_from_paths(self, paths):
        """
        Trains the agent from paths using Natural Policy Gradients
        :param paths: List of dictionaries as produced by the sampler
        :return : Performs the gradient step internally and returns
                  [mean_return, std_return, min_return, max_return] for the given paths.
        """
        # Concatenate from all the trajectories
        observations = np.concatenate([path["observations"] for path in paths])
        actions = np.concatenate([path["actions"] for path in paths])
        advantages = np.concatenate([path["advantages"] for path in paths])
        # Advantage whitening
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
        # NOTE : advantage should be zero mean in expectation
        # normalized step size invariant to advantage scaling,
        # but scaling can help with least squares

        # cache return distributions for the paths
        path_returns = [sum(p["rewards"]) for p in paths]
        mean_return = np.mean(path_returns)
        std_return = np.std(path_returns)
        min_return = np.amin(path_returns)
        max_return = np.amax(path_returns)
        base_stats = [mean_return, std_return, min_return, max_return]
        self.running_score = mean_return if self.running_score is None else \
            0.9*self.running_score + 0.1*mean_return  # approx avg of last 10 iters
        if self.save_logs:
            self.log_rollout_statistics(paths)

        # Keep track of times for various computations
        t_gLL = 0.0
        t_FIM = 0.0

        # Optimization algorithm
        # --------------------------
        reinforce_loss_before = self.compute_reinforce_loss(observations, actions,
                                                            advantages).data.numpy().ravel()[0]

        # VPG
        ts = timer.time()
        vpg_grad = self.flat_vpg(observations, actions, advantages)
        t_gLL += timer.time() - ts

        # NPG
        ts = timer.time()
        hvp = self.build_Hvp_eval([observations, actions],
                                  regu_coef=self.FIM_invert_args['damping'])
        npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                            cg_iters=self.FIM_invert_args['iters'])
        t_FIM += timer.time() - ts

        # Step size computation
        # --------------------------
        if self.alpha is not None:
            alpha = self.alpha
            n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
        else:
            n_step_size = self.n_step_size
            alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

        # Policy update
        # --------------------------
        curr_params = self.policy.get_param_values()
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        reinforce_loss_after = self.compute_reinforce_loss(observations, actions,
                                                           advantages).data.numpy().ravel()[0]
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        self.policy.set_param_values(new_params, set_new=True, set_old=True)

        # Log information
        if self.save_logs:
            self.logger.log_kv('alpha', alpha)
            self.logger.log_kv('delta', n_step_size)
            self.logger.log_kv('time_vpg', t_gLL)
            self.logger.log_kv('time_npg', t_FIM)
            self.logger.log_kv('kl_dist', kl_dist)
            self.logger.log_kv('surr_improvement', reinforce_loss_after - reinforce_loss_before)
            self.logger.log_kv('running_score', self.running_score)
            try:
                self.env.env.env.evaluate_success(paths, self.logger)
            except:
                # nested logic for backwards compatibility. TODO: clean this up.
                try:
                    success_rate = self.env.env.env.evaluate_success(paths)
                    self.logger.log_kv('success_rate', success_rate)
                except:
                    pass

        return base_stats