def gen_ro(self, log_prefix='', to_log=False):
    ro = self._gen_ro()
    self._ndata += ro.n_samples
    if to_log:
        log_rollout_info(ro, prefix=log_prefix)
        logz.log_tabular(log_prefix + 'NumberOfDataPoints', self._ndata)
    return ro
# `timed` is used as a `with` block elsewhere in this codebase, so it needs the
# contextmanager decorator (from contextlib import contextmanager).
@contextmanager
def timed(msg):
    print(colorize(msg, color='magenta'), end='', flush=True)
    tstart = time.perf_counter()
    yield
    t = time.perf_counter() - tstart
    print(colorize(" in %.3f seconds" % t, color='magenta'))
    logz.log_tabular(msg + ' Time', t)
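# Minimal usage sketch of `timed`. The `slow_step` helper below is hypothetical
# and only stands in for real work (e.g. generating rollouts); `colorize`, `logz`,
# and `time` are assumed to be importable as in the rest of this codebase.
def slow_step():
    time.sleep(0.5)  # placeholder for real work

with timed('Slow step'):
    slow_step()
# Prints "Slow step in 0.5xx seconds" and records a 'Slow step Time' entry in the tabular log.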
def _update_func_approx(self, x, y, w, to_log=False, log_prefix=''):
    """Update the function approximator based on the current data (x, y, w),
    or through self._agg_data, which is kept up-to-date with (x, y, w).
    """
    # Initial loss, computed just on the current sample.
    loss_before = self._compute_loss(x, y, w)
    explained_variance_before = math_utils.compute_explained_variance(self.predict(x), y)
    # Optimization, using the aggregated data to update.
    self.prepare_for_update(x)
    x_agg, y_agg, w_agg = self._agg_data['x'], self._agg_data['y'], self._agg_data['w']
    lr = self._update_with_lr_search(x_agg, y_agg, w_agg)
    # New loss.
    loss_after = self._compute_loss(x, y, w)
    explained_variance_after = math_utils.compute_explained_variance(self.predict(x), y)
    if to_log:
        logz.log_tabular('LossBefore({}){}'.format(self.name, log_prefix), loss_before)
        logz.log_tabular('LossAfter({}){}'.format(self.name, log_prefix), loss_after)
        logz.log_tabular('ExplainedVarianceBefore({}){}'.format(self.name, log_prefix),
                         explained_variance_before)
        logz.log_tabular('ExplainedVarianceAfter({}){}'.format(self.name, log_prefix),
                         explained_variance_after)
        logz.log_tabular('UsedLearningRate({}){}'.format(self.name, log_prefix), lr)
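# The helper math_utils.compute_explained_variance is defined elsewhere in this
# codebase. Below is a minimal self-contained sketch of the quantity typically
# logged here, assuming the standard definition 1 - Var(y - y_pred) / Var(y):
# it is 1 for a perfect fit, 0 for a predictor no better than the mean of the
# targets, and negative when the predictor is worse than that.
import numpy as np

def explained_variance(y_pred, y):
    var_y = np.var(y)
    if var_y == 0:
        return np.nan  # undefined when the targets are constant
    return 1.0 - np.var(y - y_pred) / var_y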
def run_alg(self, n_itrs, save_policy=True, save_policy_fun=None, save_freq=3,
            save_value_fun=None, save_sim_fun=None, pretrain=True,
            rollout_kwargs=None, **other_pretrain_kwargs):
    start_time = time.time()
    if pretrain:  # algorithm-specific
        if rollout_kwargs is None:
            gr = self._gen_ro_raw
        elif (rollout_kwargs['max_n_rollouts'] is None
              and rollout_kwargs['min_n_samples'] is None):
            gr = self._gen_ro_raw
        else:
            gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
        self._alg.pretrain(gr, **other_pretrain_kwargs)

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(to_log=True)
        # algorithm-specific
        if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
            mean_val = logz.get_val_from_LOG('MeanSumOfRewards')
            prefix = 'iter_{}_eval_'.format(itr) + '%.0f' % mean_val
            save_policy_fun(prefix + '_pi')
            save_value_fun(prefix + '_vfn')
            save_sim_fun(prefix + '_sim')
        self._alg.update(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log

    # Save the final policy.
    if save_policy:
        save_policy_fun('final')
        cprint('Final policy has been saved.')
def _update(self, env_ro, gen_env_ro):
    # gen_env_ro is just used for computing the gradient std.
    assert gen_env_ro is not None
    # Set _ro of self._or.
    with timed('Update Oracle'):
        # self.set_ro(env_ro)
        self._or.update(env_ro, update_nor=True, to_log=True, itr=self._itr)

    with timed('Compute Grad'):
        grads = self._or.compute_grad(ret_comps=True)
        grad = grads[0]
        if self.gradients is None:
            self.gradients = grad
        else:
            self.gradients = np.concatenate([self.gradients, grad], axis=0)
        names = ['g', 'mc_g', 'ac_os', 'tau_os', 'dr_grad_os']
        for g, name in zip(grads, names):
            logz.log_tabular('norm_{}'.format(name), la.norm(g))
        self.accum_ac += grads[2]
        self.accum_tau += grads[3]
        self.accum_func += grads[4]
        n_grads = self.gradients.shape[0]
        logz.log_tabular('norm_accum_ac_os', la.norm(self.accum_ac / n_grads))
        logz.log_tabular('norm_accum_tau_os', la.norm(self.accum_tau / n_grads))
        logz.log_tabular('norm_accum_func_os', la.norm(self.accum_func / n_grads))

    self._itr += 1
    logz.log_tabular('std', np.mean(self._policy.std))
def update(self, ro, update_nor=False, shift_adv=False, to_log=False, log_prefix=''):
    """
    Args:
        ro: RO object representing the new information.
        update_nor: whether to update the control variate of tfLikelihoodRatioOracle.
        shift_adv: whether to force the adv values to be non-negative; if a float,
            it specifies the amount to shift.
    """
    self._ro = ro  # save the ref to rollouts

    # Compute adv.
    advs, vfns = self._ae.advs(ro)  # adv has its own ref_policy
    adv = np.concatenate(advs)
    if shift_adv:  # make adv non-negative
        assert self._use_log_loss
        if shift_adv is True:
            adv = adv - np.min(adv)
        else:
            adv = adv - np.mean(adv) + shift_adv
        self._nor.reset()  # defined in tfLikelihoodRatioOracle
        update_nor = False

    if not self._normalize_weighting:
        if self._avg_type == 'sum':  # rescale the problem if needed
            adv *= len(adv) / len(ro)

    # Update the loss function.
    if self._use_log_loss is True:
        # - E_{ob} E_{ac ~ q | ob} [ w * log p(ac|ob) * adv(ob, ac) ]
        if self._onestep_weighting:  # consider importance weight
            w_or_logq = np.concatenate(self._ae.weights(ro, policy=self.policy))  # helper function
        else:
            w_or_logq = np.ones_like(adv)
    else:  # False or None
        # - E_{ob} E_{ac ~ q | ob} [ p(ac|ob)/q(ac|ob) * adv(ob, ac) ]
        assert self._onestep_weighting
        w_or_logq = ro.lps

    if to_log:
        vfn = np.concatenate(vfns)
        logz.log_tabular('max_adv', np.amax(np.abs(adv)))
        logz.log_tabular('max_vfn', np.amax(np.abs(vfn)))

    # Update the tfLikelihoodRatioOracle.
    super().update(-adv, w_or_logq, [ro.obs, ro.acs], update_nor)  # loss is negative reward
def run_alg(self, n_itrs, pretrain=True, save_policy=False, save_freq=100, final_eval=False):
    start_time = time.time()
    if pretrain:  # algorithm-specific
        self._alg.pretrain(functools.partial(self.gen_ro, to_log=False))

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(self._alg.pi_ro, logp=self._alg.logp, to_log=True)
        self._alg.update(ro)  # algorithm-specific
        logz.dump_tabular()  # dump log
def cal_variance(self, n_itrs, save_policy=True, save_value_fun=None, save_policy_fun=None,
                 save_freq=3, save_sim_fun=None, ro_file=None, save_np_file_path=None,
                 prefix=None, save_gradient=None, pretrain=True, rollout_kwargs=None,
                 **other_pretrain_kwargs):
    start_time = time.time()
    assert prefix is not None
    assert ro_file is not None
    assert save_np_file_path is not None
    save_grad_frequency = 1
    ro_path = ro_file
    with open(ro_path, 'rb') as f:
        ros = pickle.load(f)

    # Main loop
    itr = 0
    self._alg.reset_grads()
    while True:
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        self._alg.compute_grad(ros.pop(0), gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log
        if itr % save_grad_frequency == 0:
            np.save(save_np_file_path, self._alg.gradients)
        if not ros:
            break
        itr += 1
def run_alg(self, n_itrs, save_policy=None, save_policy_fun=None, save_freq=None,
            pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
    start_time = time.time()
    if pretrain:  # algorithm-specific
        if rollout_kwargs is None:
            gr = self._gen_ro_raw
        elif (rollout_kwargs['max_n_rollouts'] is None
              and rollout_kwargs['min_n_samples'] is None):
            gr = self._gen_ro_raw
        else:
            gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
        self._alg.pretrain(gr, **other_pretrain_kwargs)

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(to_log=True)
        # algorithm-specific
        self._alg.update(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log
        if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
            save_policy_fun('{}'.format(itr))

    # Save the final policy.
    if save_policy:
        save_policy_fun('final')
        cprint('Final policy has been saved.')
def est_mean(self, n_itrs, save_policy=True, save_value_fun=None, save_policy_fun=None,
             save_freq=3, save_sim_fun=None, save_gradient=None, save_np_file_path=None,
             pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
    start_time = time.time()
    # Main loop: the number of gradient estimates is fixed at 2000 (n_itrs is not used here).
    for itr in range(2000):
        ro = self._gen_ro()
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        # algorithm-specific
        self._alg.compute_grad(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log
    mean_st = self._alg.gradients
    est_mean = np.mean(mean_st, axis=0, keepdims=True)
    np.save(save_np_file_path, est_mean)
def update(self, g=None, to_log=False, *args, **kwargs):
    assert g is not None
    # Compute V.
    if self.w is None:  # initialization (V is not needed)
        self.dim = g.shape[0]
        self.V = self._compute_V()  # XXX for logging
    else:
        assert self.V is not None  # make sure compute_grad has been queried
        pred_error_size = la.norm(np.dot(self.V, self.w) - g)

    # Update the most recent oracle using new samples (rotate right).
    oracle = self._base_oracles.pop()  # pop the right-most element
    oracle.update(to_log=to_log, *args, **kwargs)
    self._base_oracles.appendleft(oracle)
    if self.n_valid_base_oracles < self.n_base_oracles:
        self.n_valid_base_oracles += 1

    # Regression using true grads.
    if self.mode == 'average':
        self.w = np.zeros(self.n_base_oracles)
        self.w[:self.n_valid_base_oracles] = 1.0 / self.n_valid_base_oracles
    elif self.mode == 'recent':
        self.w = np.zeros(self.n_base_oracles)
        self.w[0] = 1.0
    else:
        if self.w is None:  # initialization: cst * 1/2 * (w - e_1)^2
            self.w = np.zeros(self.n_base_oracles)
            self.w[0] = 1.0
            self.A = (self.reg_factor * la.norm(g)**2 / self.n_base_oracles) * np.eye(self.n_base_oracles)
            self.b = np.dot(self.A, self.w)
        else:
            self.A = (1.0 - self.mode) * self.A + np.matmul(self.V.T, self.V)
            self.b = (1.0 - self.mode) * self.b + np.matmul(self.V.T, g)
            self.w = la.solve(self.A, self.b)
            self.w = np.clip(self.w, 0.0, 2.0)  # XXX

    if to_log:
        logz.log_tabular('min_weights', np.min(self.w))
        logz.log_tabular('max_weights', np.max(self.w))
        logz.log_tabular('norm_weights', la.norm(self.w))
    # Reset V.
    self.V = None
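# A standalone numpy sketch of the regression branch in `update` above (the case
# where `mode` is a float forgetting factor rather than 'average' or 'recent').
# All shapes and values here are hypothetical: V stacks the base-oracle gradient
# estimates as columns, and g is the newly observed target gradient.
import numpy as np
import numpy.linalg as la

dim, n_base = 5, 3
rng = np.random.default_rng(0)
V = rng.normal(size=(dim, n_base))   # columns: base-oracle gradient estimates
g = rng.normal(size=dim)             # target gradient
forget = 0.1                         # stands in for a float-valued `mode`
reg_factor = 1e-2

# Initialization: regularize toward w = e_1, i.e. full weight on the newest oracle.
w = np.zeros(n_base)
w[0] = 1.0
A = (reg_factor * la.norm(g) ** 2 / n_base) * np.eye(n_base)
b = A @ w

# One recursive update of the normal equations, then solve and clip.
A = (1.0 - forget) * A + V.T @ V
b = (1.0 - forget) * b + V.T @ g
w = np.clip(la.solve(A, b), 0.0, 2.0)
print(w)  # combination weights over the base oracles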
def _update(self, env_ro, gen_env_ro):
    # gen_env_ro is just used for computing the gradient std.
    assert gen_env_ro is not None
    # XXX If using simulation to train the vf, the vf should be updated after the
    # policy normalizer is updated.
    if self.gen_sim_ro is not None:
        with timed('Generate sim data'):
            sim_ro = self.gen_sim_ro()
        with timed('Update ae'):
            self._or.update_ae(sim_ro, to_log=True)  # update value function

    if self.log_sigmas_freq is not None and self._itr % self.log_sigmas_freq == 0:
        with timed('Compute Sigmas'):
            self._or.log_sigmas(**self.log_sigmas_kwargs)

    with timed('Update Oracle'):
        self._or.update(env_ro, update_nor=True, to_log=True, itr=self._itr)

    with timed('Compute Grad'):
        grads = self._or.compute_grad(ret_comps=True)
        grad = grads[0]
        names = ['g', 'mc_g', 'ac_os', 'tau_os']
        for g, name in zip(grads, names):
            logz.log_tabular('norm_{}'.format(name), la.norm(g))

    with timed('Take Gradient Step'):
        self._learner.update(grad, self._or.ro)  # take the grad step with the env_ro

    if self.gen_sim_ro is None:
        with timed('Update ae'):
            self._or.update_ae(env_ro, to_log=True)  # update value function

    # Always update dynamics using true data.
    with timed('Update dyn'):
        self._or.update_dyn(env_ro, to_log=True)  # update dynamics
    with timed('Update rw'):
        self._or.update_rw(env_ro, to_log=True)

    self._itr += 1
    logz.log_tabular('online_learner_stepsize', self._learner.stepsize)
    logz.log_tabular('std', np.mean(self._policy.std))
def log_rollout_info(ro, prefix=''):
    if not hasattr(log_rollout_info, "total_n_samples"):
        log_rollout_info.total_n_samples = {}  # static variable
    if prefix not in log_rollout_info.total_n_samples:
        log_rollout_info.total_n_samples[prefix] = 0
    sum_of_rewards = [rollout.rws.sum() for rollout in ro.rollouts]
    rollout_lens = [len(rollout) for rollout in ro.rollouts]
    n_samples = sum(rollout_lens)
    log_rollout_info.total_n_samples[prefix] += n_samples
    logz.log_tabular(prefix + "NumSamplesThisBatch", n_samples)
    logz.log_tabular(prefix + "NumberOfRollouts", len(ro))
    logz.log_tabular(prefix + "TotalNumSamples", log_rollout_info.total_n_samples[prefix])
    logz.log_tabular(prefix + "MeanSumOfRewards", np.mean(sum_of_rewards))
    logz.log_tabular(prefix + "StdSumOfRewards", np.std(sum_of_rewards))
    logz.log_tabular(prefix + "MaxSumOfRewards", np.max(sum_of_rewards))
    logz.log_tabular(prefix + "MinSumOfRewards", np.min(sum_of_rewards))
    logz.log_tabular(prefix + "MeanRolloutLens", np.mean(rollout_lens))
    logz.log_tabular(prefix + "StdRolloutLens", np.std(rollout_lens))
    logz.log_tabular(prefix + "MeanOfRewards",
                     np.sum(sum_of_rewards) / (n_samples + len(sum_of_rewards)))
def update(self, ro):
    self._ro = ro
    if not self._ignore_samples:
        # Update input normalizer for whitening.
        self._policy.prepare_for_update(self._ro.obs)
        # Correction Step (Model-free)
        self._correction()

    # end of round
    self._itr += 1

    # log
    logz.log_tabular('pcl_stepsize', self._pcl.stepsize)
    logz.log_tabular('std', np.mean(self._policy.std))
    if not self._ignore_samples:
        logz.log_tabular('true_grads_size', np.linalg.norm(self._g))
        logz.log_tabular('pred_grads_size', np.linalg.norm(self._pcl.g_hat))
        pred_error_size = np.linalg.norm(self._g - self._pcl.g_hat)
        ratio = pred_error_size / np.linalg.norm(self._g)
        logz.log_tabular('pred_error_size', pred_error_size)
        logz.log_tabular('pred_error_true_ratio', ratio)

    # Prediction Step (Model-based)
    if self._w_pred:
        self._prediction()

    # log
    logz.log_tabular('std_after', np.mean(self._policy.std))
def log_sigmas(self, idx=100, n_ros=30, n_acs=30, n_taus=30, n_steps=None, use_vf=False):
    # Estimate the variance of G_idx for different control variates, for comparison.
    #   n_steps: roll out for at most n_steps steps for tau.
    #   use_vf: use the value function to reduce the variance in estimating E_a E_tau NQ.
    # Collect samples. Data structures:
    #   sts: 2d array.
    #   acs: 3d array.
    #   advs (advantage function): 3d array.
    #   N (log probability gradient): 3d array.
    # XXX Use a state baseline to reduce the variance of the estimates.
    ro = self.gen_ro(max_n_rollouts=n_ros, max_rollout_len=idx + 1)
    sts = np.array([r.obs[idx] for r in ro.rollouts if len(r) > idx])
    n_sts = len(sts)
    if n_sts == 0:
        log = {
            'sigma_s_mc': 0.0,
            'sigma_a_mc': 0.0,
            'sigma_tau_mc': 0.0,
            'n_ros_in_total': n_sts * n_acs * n_taus,
            'n_sts': n_sts,
        }
    else:
        acs = self.policy.pi(np.repeat(sts, n_acs, axis=0))
        acs = np.reshape(acs, [n_sts, n_acs, -1])
        Q = np.zeros((n_ros, n_acs, n_taus))
        N_dim = len(self.policy.logp_grad(ro.obs[0], ro.acs[0]))
        N = np.zeros((n_ros, n_acs, N_dim))
        decay = self.ae._pe.gamma * self.delta
        for i, s in enumerate(sts):
            for j, a in enumerate(acs[i]):
                # This should be the bottleneck!!
                ro = self.gen_ro(max_n_rollouts=n_taus, max_rollout_len=n_steps,
                                 start_state=s, start_action=a)
                N[i, j] = self.policy.logp_grad(s, a)
                for k, r in enumerate(ro.rollouts):
                    q0 = ((decay ** np.arange(len(r))) * r.rws).sum()
                    Q[i, j, k] = q0
        # Fill the rest with zeros.
        if use_vf:
            V = np.zeros(n_ros)
            for i, s in enumerate(sts):
                V[i] = self.ae._vfn.predict(s[None])[0]

        def compute_sigma_s(Q):
            E_tau_Q = np.mean(Q, axis=2)  # s x a
            if use_vf:
                E_tau_Q -= np.expand_dims(V, axis=-1)  # s x 1
            E_tau_Q = np.expand_dims(E_tau_Q, axis=-1)  # s x a x 1
            E_a_tau_NQ = np.mean(E_tau_Q * N, axis=1)  # s x N
            E_s_a_tau_NQ = np.mean(E_a_tau_NQ, axis=0)  # N
            E_s_a_tau_NQ = np.expand_dims(E_s_a_tau_NQ, axis=0)  # 1 x N
            Var = np.mean(np.square(E_a_tau_NQ - E_s_a_tau_NQ), axis=0)  # N
            sigma = np.sqrt(np.sum(Var))
            return sigma

        def compute_sigma_a(Q):
            E_tau_Q = np.mean(Q, axis=2)  # s x a
            E_tau_Q = np.expand_dims(E_tau_Q, axis=-1)  # s x a x 1
            N_E_tau_Q = N * E_tau_Q  # s x a x N
            if use_vf:
                N_E_tau_Q_for_E_a = N * (E_tau_Q - np.reshape(V, V.shape + (1, 1)))
            else:
                N_E_tau_Q_for_E_a = N_E_tau_Q
            E_a_N_E_tau_Q = np.mean(N_E_tau_Q_for_E_a, axis=1)  # s x N
            E_a_N_E_tau_Q = np.expand_dims(E_a_N_E_tau_Q, axis=1)  # s x 1 x N
            Var = np.mean(np.square(N_E_tau_Q - E_a_N_E_tau_Q), axis=1)  # s x N
            sigma = np.sqrt(np.sum(np.mean(Var, axis=0)))
            return sigma

        def compute_sigma_tau(Q):
            E_tau_Q = np.mean(Q, axis=2)  # s x a
            E_tau_Q = np.expand_dims(E_tau_Q, axis=-1)  # s x a x 1
            Var = np.mean(np.square(Q - E_tau_Q), axis=2)  # s x a
            Var = np.expand_dims(Var, axis=-1)  # s x a x 1
            sigma = np.sqrt(np.sum(np.mean(np.square(N) * Var, axis=(0, 1))))
            return sigma

        log = {
            'sigma_s_mc': compute_sigma_s(Q),
            'sigma_a_mc': compute_sigma_a(Q),
            'sigma_tau_mc': compute_sigma_tau(Q),
            'n_ros_in_total': n_sts * n_acs * n_taus,
            'n_sts': n_sts,
        }
    for k, v in log.items():
        logz.log_tabular(k, v)
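# One reading of the three Monte Carlo quantities computed by log_sigmas above,
# written in the same pseudo-math style as the other comments in this codebase
# (N = grad_theta log pi(a|s), Q = truncated discounted return, and d indexes the
# policy-parameter dimensions). When use_vf is set, a value-function baseline V(s)
# is additionally subtracted from E_tau[Q] in the first two estimates:
#   sigma_s^2   = sum_d Var_s ( E_a [ N_d(s,a) * E_tau[Q(s,a,tau)] ] )
#   sigma_a^2   = sum_d E_s   [ Var_a ( N_d(s,a) * E_tau[Q(s,a,tau)] ) ]
#   sigma_tau^2 = sum_d E_s,a [ N_d(s,a)^2 * Var_tau ( Q(s,a,tau) ) ]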