def train_from_paths(self, paths):
    observations, actions, advantages, base_stats, self.running_score = \
        self.process_paths(paths)
    if self.save_logs:
        self.log_rollout_statistics(paths)

    # Keep track of times for various computations
    t_gLL = 0.0
    t_FIM = 0.0

    # normalize inputs if necessary
    if self.input_normalization:
        data_in_shift, data_in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
        pi_in_shift, pi_in_scale = self.policy.model.in_shift.data.numpy(), self.policy.model.in_scale.data.numpy()
        pi_out_shift, pi_out_scale = self.policy.model.out_shift.data.numpy(), self.policy.model.out_scale.data.numpy()
        # blend the policy's running input statistics with the current batch statistics
        pi_in_shift = self.input_normalization * pi_in_shift + (1 - self.input_normalization) * data_in_shift
        pi_in_scale = self.input_normalization * pi_in_scale + (1 - self.input_normalization) * data_in_scale
        self.policy.model.set_transformations(pi_in_shift, pi_in_scale, pi_out_shift, pi_out_scale)

    # Optimization algorithm
    # --------------------------
    surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

    # VPG
    ts = timer.time()
    vpg_grad = self.flat_vpg(observations, actions, advantages)
    t_gLL += timer.time() - ts

    # NPG
    ts = timer.time()
    hvp = self.build_Hvp_eval([observations, actions],
                              regu_coef=self.FIM_invert_args['damping'])
    npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                        cg_iters=self.FIM_invert_args['iters'])
    t_FIM += timer.time() - ts

    # Step size computation
    # --------------------------
    if self.alpha is not None:
        # fixed learning rate; back out the implied normalized step size for logging
        alpha = self.alpha
        n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
    else:
        # normalized step size: alpha = sqrt(delta / (g^T F^-1 g))
        n_step_size = self.n_step_size
        alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

    # Policy update
    # --------------------------
    curr_params = self.policy.get_param_values()
    new_params = curr_params + alpha * npg_grad
    self.policy.set_param_values(new_params, set_new=True, set_old=False)
    surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
    kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
    self.policy.set_param_values(new_params, set_new=True, set_old=True)

    # Log information
    if self.save_logs:
        self.logger.log_kv('alpha', alpha)
        self.logger.log_kv('delta', n_step_size)
        self.logger.log_kv('time_vpg', t_gLL)
        self.logger.log_kv('time_npg', t_FIM)
        self.logger.log_kv('kl_dist', kl_dist)
        self.logger.log_kv('surr_improvement', surr_after - surr_before)
        self.logger.log_kv('running_score', self.running_score)
        try:
            self.env.env.env.evaluate_success(paths, self.logger)
        except:
            # nested logic for backwards compatibility. TODO: clean this up.
            try:
                success_rate = self.env.env.env.evaluate_success(paths)
                self.logger.log_kv('success_rate', success_rate)
            except:
                pass

    return base_stats
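
# The natural gradient above is obtained by approximately solving F x = g with
# conjugate gradient, where F is the damped Fisher information matrix exposed
# through the Hessian-vector-product closure `hvp` and g is the vanilla policy
# gradient. The mjrl `cg_solve` utility may differ in details; the function below
# is a minimal, self-contained sketch of that routine under those assumptions.
import numpy as np

def cg_solve_sketch(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10):
    """Approximately solve f_Ax(x) = b via conjugate gradient (illustrative helper)."""
    x = np.zeros_like(b) if x_0 is None else np.array(x_0, dtype=np.float64)
    r = b - f_Ax(x)            # residual
    p = r.copy()               # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        step = rdotr / (p.dot(Ap) + 1e-20)
        x = x + step * p
        r = r - step * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x
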
def train_from_paths(self, paths):

    # Concatenate from all the trajectories
    observations = np.concatenate([path["observations"] for path in paths])
    actions = np.concatenate([path["actions"] for path in paths])
    advantages = np.concatenate([path["advantages"] for path in paths])
    # Advantage whitening
    advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
    # NOTE : advantage should be zero mean in expectation
    # normalized step size invariant to advantage scaling,
    # but scaling can help with least squares

    # cache return distributions for the paths
    path_returns = [sum(p["rewards"]) for p in paths]
    mean_return = np.mean(path_returns)
    std_return = np.std(path_returns)
    min_return = np.amin(path_returns)
    max_return = np.amax(path_returns)
    base_stats = [mean_return, std_return, min_return, max_return]
    self.running_score = mean_return if self.running_score is None else \
        0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
    if self.save_logs:
        self.log_rollout_statistics(paths)

    # Keep track of times for various computations
    t_gLL = 0.0
    t_FIM = 0.0

    # Optimization algorithm
    # --------------------------
    surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

    # VPG
    ts = timer.time()
    vpg_grad = self.flat_vpg(observations, actions, advantages)
    t_gLL += timer.time() - ts

    # NPG
    ts = timer.time()
    hvp = self.build_Hvp_eval([observations, actions],
                              regu_coef=self.FIM_invert_args['damping'])
    npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
                        cg_iters=self.FIM_invert_args['iters'])
    t_FIM += timer.time() - ts

    # Step size computation
    # --------------------------
    n_step_size = 2.0 * self.kl_dist
    alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

    # Policy update
    # --------------------------
    curr_params = self.policy.get_param_values()
    for k in range(100):
        new_params = curr_params + alpha * npg_grad
        self.policy.set_param_values(new_params, set_new=True, set_old=False)
        kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
        surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
        if kl_dist < self.kl_dist:
            break
        else:
            alpha = 0.9 * alpha  # backtrack
            print("Step size too high. Backtracking. | kl = %f | surr diff = %f" %
                  (kl_dist, surr_after - surr_before))
        if k == 99:
            alpha = 0.0

    new_params = curr_params + alpha * npg_grad
    self.policy.set_param_values(new_params, set_new=True, set_old=False)
    kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
    surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
    self.policy.set_param_values(new_params, set_new=True, set_old=True)

    # Log information
    if self.save_logs:
        self.logger.log_kv('alpha', alpha)
        self.logger.log_kv('delta', n_step_size)
        self.logger.log_kv('time_vpg', t_gLL)
        self.logger.log_kv('time_npg', t_FIM)
        self.logger.log_kv('kl_dist', kl_dist)
        self.logger.log_kv('surr_improvement', surr_after - surr_before)
        self.logger.log_kv('running_score', self.running_score)
        try:
            self.env.env.env.evaluate_success(paths, self.logger)
        except:
            # nested logic for backwards compatibility. TODO: clean this up.
            try:
                success_rate = self.env.env.env.evaluate_success(paths)
                self.logger.log_kv('success_rate', success_rate)
            except:
                pass

    return base_stats
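
# The policy update above performs a backtracking line search: the proposed step
# is shrunk by a factor of 0.9 until the KL divergence between the new and old
# policies falls below the trust-region limit self.kl_dist, giving up (alpha = 0)
# after 100 attempts. The standalone function below is a hypothetical sketch of
# that rule; `set_params` and `eval_kl` are assumed callbacks, not mjrl APIs.
def backtracked_step_size(curr_params, direction, alpha_init, kl_limit,
                          set_params, eval_kl, max_backtracks=100, shrink=0.9):
    """Shrink the step size until the KL constraint is satisfied (sketch)."""
    alpha = alpha_init
    for _ in range(max_backtracks):
        set_params(curr_params + alpha * direction)
        if eval_kl() < kl_limit:
            return alpha           # accepted step
        alpha = shrink * alpha     # step too large; backtrack
    return 0.0                     # no acceptable step found; keep old parameters
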
def train_from_paths(self, paths):

    # Concatenate from all the trajectories
    observations = np.concatenate([path["observations"] for path in paths])
    actions = np.concatenate([path["actions"] for path in paths])
    advantages = np.concatenate([path["advantages"] for path in paths])
    # Advantage whitening
    advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)

    if self.demo_paths is not None and self.lam_0 > 0.0:
        demo_obs = np.concatenate([path["observations"] for path in self.demo_paths])
        demo_act = np.concatenate([path["actions"] for path in self.demo_paths])
        # demonstration pseudo-advantages decay geometrically with the iteration count
        demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0])
        self.iter_count += 1
        # concatenate all
        all_obs = np.concatenate([observations, demo_obs])
        all_act = np.concatenate([actions, demo_act])
        all_adv = 1e-2 * np.concatenate([advantages / (np.std(advantages) + 1e-8), demo_adv])
    else:
        all_obs = observations
        all_act = actions
        all_adv = advantages

    entropy = np.sum(self.policy.log_std_val + np.log(np.sqrt(2 * np.pi * np.e)))  # taken from inverse_rl repo
    if self.save_logs:
        self.logger.log_kv('entropy', entropy)
    if self.entropy_weight > 0:
        all_adv = all_adv + self.entropy_weight * entropy

    # cache return distributions for the paths
    path_returns = [sum(p["rewards"]) for p in paths]
    mean_return = np.mean(path_returns)
    std_return = np.std(path_returns)
    min_return = np.amin(path_returns)
    max_return = np.amax(path_returns)
    base_stats = [mean_return, std_return, min_return, max_return]
    self.running_score = mean_return if self.running_score is None else \
        0.9 * self.running_score + 0.1 * mean_return  # approx avg of last 10 iters
    if self.save_logs:
        self.log_rollout_statistics(paths)

    # Keep track of times for various computations
    t_gLL = 0.0
    t_FIM = 0.0

    # Optimization algorithm
    # --------------------------
    surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]

    # DAPG
    ts = timer.time()
    sample_coef = all_adv.shape[0] / advantages.shape[0]
    dapg_grad = sample_coef * self.flat_vpg(all_obs, all_act, all_adv)
    t_gLL += timer.time() - ts

    # NPG
    ts = timer.time()
    hvp = self.build_Hvp_eval([observations, actions],
                              regu_coef=self.FIM_invert_args['damping'])
    npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(),
                        cg_iters=self.FIM_invert_args['iters'])
    t_FIM += timer.time() - ts

    # Step size computation
    # --------------------------
    n_step_size = 2.0 * self.kl_dist
    alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))

    # Policy update
    # --------------------------
    curr_params = self.policy.get_param_values()
    new_params = curr_params + alpha * npg_grad
    self.policy.set_param_values(new_params, set_new=True, set_old=False)
    surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
    kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
    self.policy.set_param_values(new_params, set_new=True, set_old=True)

    # Log information
    if self.save_logs:
        self.logger.log_kv('alpha', alpha)
        self.logger.log_kv('delta', n_step_size)
        self.logger.log_kv('time_vpg', t_gLL)
        self.logger.log_kv('time_npg', t_FIM)
        self.logger.log_kv('kl_dist', kl_dist)
        self.logger.log_kv('surr_improvement', surr_after - surr_before)
        self.logger.log_kv('running_score', self.running_score)
        try:
            self.env.env.env.evaluate_success(paths, self.logger)
        except:
            # nested logic for backwards compatibility. TODO: clean this up.
            try:
                success_rate = self.env.env.env.evaluate_success(paths)
                self.logger.log_kv('success_rate', success_rate)
            except:
                pass

    return base_stats
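
# The DAPG variant mixes demonstration state-action pairs into the policy gradient
# by assigning them a constant pseudo-advantage lam_0 * lam_1**iter_count, so the
# demonstrations carry significant weight early in training and fade out geometrically
# as on-policy advantages take over. A small illustration of that schedule, using
# hypothetical hyperparameter values (not taken from any particular config):
lam_0, lam_1 = 1e-2, 0.95
for k in (0, 10, 50, 100):
    print("iteration %3d -> demo weight %.2e" % (k, lam_0 * lam_1 ** k))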