def test_pad_tensor(self):
    results = pad_tensor(self.tensor, self.max_len)
    assert len(self.tensor) == 3
    assert np.array_equal(results, [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])

    results = pad_tensor(self.tensor, self.max_len, mode='last')
    assert np.array_equal(results, [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
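# A minimal sketch of the behavior the test above exercises, assuming
# pad_tensor pads a 1-D array to max_len with zeros by default and repeats
# the last element when mode='last'. This is an illustrative stand-in, not
# the library implementation.
import numpy as np


def pad_tensor_sketch(x, max_len, mode='zero'):
    """Pad 1-D array x out to length max_len."""
    x = np.asarray(x)
    fill = x[-1] if mode == 'last' else 0
    return np.concatenate([x, np.full(max_len - len(x), fill, dtype=x.dtype)])


# pad_tensor_sketch([1, 1, 1], 10)               -> [1, 1, 1, 0, ..., 0]
# pad_tensor_sketch([1, 1, 1], 10, mode='last')  -> [1, 1, 1, 1, ..., 1]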
def sliding_window(t, window, step_size, smear=False):
    if window > t.shape[0]:
        raise ValueError("`window` must be <= `t.shape[0]`")
    elif window == t.shape[0]:
        return np.stack([t] * window)

    # TODO(gh/19): this is broken for other step sizes. The problem may be
    # with the transpose trick
    if step_size != 1:
        raise NotImplementedError

    # The stride trick works only on the last dimension of an ndarray, so we
    # operate on the transpose, which reverses the dimensions of t.
    t_T = t.T
    shape = t_T.shape[:-1] + (t_T.shape[-1] - window + 1 - step_size, window)
    strides = t_T.strides + (t_T.strides[-1] * step_size, )
    t_T_win = np.lib.stride_tricks.as_strided(t_T,
                                              shape=shape,
                                              strides=strides)

    # t_T_win has shape (d_k, d_k-1, ..., (n - window_size), window_size)
    # To arrive at the final shape, we first transpose the result to arrive at
    # (window_size, (n - window_size), d_1, ..., d_k), then swap the first two
    # axes
    t_win = np.swapaxes(t_T_win.T, 0, 1)

    # Optionally smear the last element to preserve the first dimension
    if smear:
        t_win = tensor_utils.pad_tensor(t_win, t.shape[0], mode='last')

    return t_win
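# Illustrative usage of sliding_window for a 1-D input (an assumed example,
# not part of the library). With step_size=1, each output row is a length-3
# slice of t starting at successive indices; smear=True would pad the first
# dimension back to t.shape[0] by repeating the last window.
import numpy as np

t = np.arange(6)                          # [0, 1, 2, 3, 4, 5]
t_win = sliding_window(t, window=3, step_size=1)
# t_win[0] == [0, 1, 2], t_win[1] == [1, 2, 3], t_win[2] == [2, 3, 4]
print(t_win.shape)                        # (3, 3)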
def optimize_policy(self, itr, samples_data):
    # Init vars
    rewards = samples_data['rewards']
    actions = samples_data['actions']
    observations = samples_data['observations']

    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]

    if self.policy.recurrent:
        recurrent_vals = [samples_data["valids"]]
    else:
        recurrent_vals = []

    # Compute sample Bellman error.
    feat_diff = []
    for path in samples_data['paths']:
        feats = self._features(path)
        feats = np.vstack([feats, np.zeros(feats.shape[1])])
        feat_diff.append(feats[1:] - feats[:-1])

    if self.policy.recurrent:
        max_path_length = max(
            [len(path["advantages"]) for path in samples_data["paths"]])
        # pad feature diffs
        feat_diff = np.array([
            tensor_utils.pad_tensor(fd, max_path_length) for fd in feat_diff
        ])
    else:
        feat_diff = np.vstack(feat_diff)

    #################
    # Optimize dual #
    #################

    # Here we need to optimize dual through BFGS in order to obtain \eta
    # value. Initialize dual function g(\theta, v). \eta > 0
    # First eval delta_v
    f_dual = self.opt_info['f_dual']
    f_dual_grad = self.opt_info['f_dual_grad']

    # Set BFGS eval function
    def eval_dual(input):
        param_eta = input[0]
        param_v = input[1:]
        val = f_dual(*([rewards, feat_diff] + state_info_list +
                       recurrent_vals + [param_eta, param_v]))
        return val.astype(np.float64)

    # Set BFGS gradient eval function
    def eval_dual_grad(input):
        param_eta = input[0]
        param_v = input[1:]
        grad = f_dual_grad(*([rewards, feat_diff] + state_info_list +
                             recurrent_vals + [param_eta, param_v]))
        eta_grad = float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    # Initial BFGS parameter values.
    x0 = np.hstack([self.param_eta, self.param_v])

    # Set parameter boundaries: \eta>0, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (0., np.inf)

    # Optimize through BFGS
    logger.log('optimizing dual')
    eta_before = x0[0]
    dual_before = eval_dual(x0)
    params_ast, _, _ = self.optimizer(func=eval_dual,
                                      x0=x0,
                                      fprime=eval_dual_grad,
                                      bounds=bounds,
                                      maxiter=self.max_opt_itr,
                                      disp=0)
    dual_after = eval_dual(params_ast)

    # Optimal values have been obtained
    self.param_eta = params_ast[0]
    self.param_v = params_ast[1:]

    ###################
    # Optimize policy #
    ###################
    cur_params = self.policy.get_param_values(trainable=True)
    f_loss = self.opt_info["f_loss"]
    f_loss_grad = self.opt_info['f_loss_grad']
    input = [
        rewards, observations, feat_diff, actions
    ] + state_info_list + recurrent_vals + [self.param_eta, self.param_v]

    # Set loss eval function
    def eval_loss(params):
        self.policy.set_param_values(params, trainable=True)
        val = f_loss(*input)
        return val.astype(np.float64)

    # Set loss gradient eval function
    def eval_loss_grad(params):
        self.policy.set_param_values(params, trainable=True)
        grad = f_loss_grad(*input)
        flattened_grad = tensor_utils.flatten_tensors(
            list(map(np.asarray, grad)))
        return flattened_grad.astype(np.float64)

    loss_before = eval_loss(cur_params)
    logger.log('optimizing policy')
    params_ast, _, _ = self.optimizer(func=eval_loss,
                                      x0=cur_params,
                                      fprime=eval_loss_grad,
                                      disp=0,
                                      maxiter=self.max_opt_itr)
    loss_after = eval_loss(params_ast)

    f_kl = self.opt_info['f_kl']
    mean_kl = f_kl(*([observations, actions] + state_info_list +
                     dist_info_list + recurrent_vals)).astype(np.float64)

    logger.log('eta %f -> %f' % (eta_before, self.param_eta))
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)
    logger.record_tabular('DualBefore', dual_before)
    logger.record_tabular('DualAfter', dual_after)
    logger.record_tabular('MeanKL', mean_kl)
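# A standalone NumPy sketch of the quantity f_dual is assumed to compute: the
# REPS dual g(eta, v) = eta*epsilon + eta*log(mean(exp(delta_v / eta))), with
# sample Bellman errors delta_v = r + v^T (phi' - phi) built from the feature
# differences above (Peters et al., 2010). Illustrative only; the method above
# evaluates a compiled symbolic version of the same objective.
import numpy as np


def reps_dual_sketch(eta, v, rewards, feat_diff, epsilon):
    """Dual objective g(eta, v) evaluated on sampled transitions."""
    delta_v = rewards + feat_diff.dot(v)      # sample Bellman errors
    # Shift by the max for a numerically stable log-sum-exp.
    max_delta = np.max(delta_v)
    return eta * epsilon + max_delta + eta * np.log(
        np.mean(np.exp((delta_v - max_delta) / eta)))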
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, 'predict_n'):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path['rewards'] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path['advantages'] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path['returns'])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path['observations'] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path['actions'] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path['rewards'] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path['returns'] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path['advantages'] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path['env_infos'] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path['agent_infos'] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])

        undiscounted_returns = [sum(path['rewards']) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path['advantages']) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path['advantages'] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path['advantages'] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path['advantages'] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])

        undiscounted_returns = [sum(path['rewards']) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log('fitting baseline...')
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log('fitted')

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('ExplainedVariance', ev)
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
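# A small self-contained sketch of the discounted cumulative sum that
# special.discount_cumsum is assumed to implement, and of the per-path
# GAE(lambda) advantage computation performed above. The reversed
# accumulation loop is an illustrative reference, not the library routine.
import numpy as np


def discount_cumsum_sketch(x, discount):
    """y[t] = sum over l >= 0 of discount**l * x[t + l]."""
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y


# Example: advantages for a single 3-step path with a hypothetical baseline,
# mirroring the per-path loop in process_samples.
rewards = np.array([1.0, 0.0, 1.0])
baselines = np.array([0.5, 0.4, 0.3, 0.0])   # terminal value 0 appended
discount, gae_lambda = 0.99, 0.95
deltas = rewards + discount * baselines[1:] - baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)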