def __init__(self, env_dim, act_dim, inner_lr=None, **kwargs):
    """Initialize a discrete-action policy with a softmax output head.

    Args:
        env_dim: Dimensionality of the observation/input vector.
        act_dim: Number of discrete actions (softmax output size).
        inner_lr: Required step size for the per-parameter Adam optimizers
            used in the inner loop.
        **kwargs: Forwarded to the parent initializer.

    Raises:
        ValueError: If ``inner_lr`` is not provided.
    """
    # Explicit raise instead of `assert`: asserts are stripped under -O,
    # which would silently allow inner_lr=None through to Adam below.
    if inner_lr is None:
        raise ValueError("inner_lr must be provided")
    # Third positional arg (1) is the number of policy output parameter
    # groups per action — presumably just the softmax logits; TODO confirm
    # against the parent class.
    super().__init__(env_dim, act_dim, 1, **kwargs)
    # Two hidden layers of 64 units, softmax over actions at the output.
    self.pi = NN([env_dim, 64, 64, act_dim], out_fn=F.softmax)
    # One Adam state per inner-loop-trainable parameter tensor.
    self._lst_adam = [
        Adam(var.shape, stepsize=inner_lr) for var in self.backprop_params
    ]
def __init__(self, env_dim, act_dim, inner_lr=None, **kwargs):
    """Initialize a continuous-action policy: NN mean head + learned log-std.

    Args:
        env_dim: Dimensionality of the observation/input vector.
        act_dim: Dimensionality of the continuous action vector.
        inner_lr: Required step size for the per-parameter Adam optimizers
            used in the inner loop.
        **kwargs: Forwarded to the parent initializer.

    Raises:
        ValueError: If ``inner_lr`` is not provided.
    """
    # Explicit raise instead of `assert`: asserts are stripped under -O,
    # which would silently allow inner_lr=None through to Adam below.
    if inner_lr is None:
        raise ValueError("inner_lr must be provided")
    # Third positional arg (2) is the number of policy output parameter
    # groups per action — presumably mean and log-std; TODO confirm
    # against the parent class.
    super().__init__(env_dim, act_dim, 2, **kwargs)
    # Two hidden layers of 64 units; identity output (raw action means).
    self._pi = NN([env_dim, 64, 64, act_dim], out_fn=lambda x: x)
    # State-independent log standard deviation, one entry per action dim,
    # initialized to zero (i.e. unit std).
    self._logstd = C.Variable(np.zeros(act_dim, dtype=np.float32))
    # One Adam state per inner-loop-trainable parameter tensor.
    self._lst_adam = [
        Adam(var.shape, stepsize=inner_lr) for var in self.backprop_params
    ]
def __init__(self, env_dim, act_dim, policy_output_params,
             memory_out_size=None, inner_n_opt_steps=None,
             inner_opt_batch_size=None, inner_use_ppo=None, mem=None,
             buffer_size=None):
    """Initialize the shared policy machinery: optional PPO components,
    memory module, exploration bonus, trajectory loss, and normalizer.

    Args:
        env_dim: Dimensionality of the observation vector.
        act_dim: Dimensionality of the action vector.
        policy_output_params: Number of output parameter groups per action
            dim (e.g. 1 for softmax logits, 2 for mean + log-std).
        memory_out_size: Output size of the memory module.
        inner_n_opt_steps: Required number of inner-loop optimization steps.
        inner_opt_batch_size: Required inner-loop minibatch size.
        inner_use_ppo: Required flag selecting a PPO inner-loop objective
            (with its own value function) instead of the learned loss alone.
        mem: Memory-usage flag/config stored as ``self._use_mem`` —
            presumably a boolean toggle; TODO confirm against callers.
        buffer_size: Replay/trajectory buffer size.

    Raises:
        ValueError: If any required keyword argument is missing.
    """
    # Explicit raises instead of `assert`: asserts are stripped under -O.
    if inner_n_opt_steps is None:
        raise ValueError("inner_n_opt_steps must be provided")
    if inner_opt_batch_size is None:
        raise ValueError("inner_opt_batch_size must be provided")
    if inner_use_ppo is None:
        raise ValueError("inner_use_ppo must be provided")

    self._use_ppo = inner_use_ppo
    if self._use_ppo:
        # Standard PPO hyperparameters: discount, GAE lambda, KL penalty
        # coefficient, and clipping range.
        self._ppo_gam = 0.99
        self._ppo_lam = 0.95
        self._ppo_klcoeff = 0.001
        self._ppo_clipparam = 0.2
        # Scalar value function with two 64-unit hidden layers.
        # NOTE(review): _vf is only defined when PPO is enabled.
        self._vf = NN([env_dim, 64, 64, 1], out_fn=lambda x: x)

    # Policy heads are created by subclasses.
    self.pi = None
    self._logstd = None

    self._use_mem = mem
    self._buffer_size = buffer_size
    self.inner_n_opt_steps = inner_n_opt_steps
    self.inner_opt_batch_size = inner_opt_batch_size

    self._mem_out_size = memory_out_size
    self._mem = Memory(64, self._mem_out_size)

    # Count-based exploration bonus over hashed observations.
    self.lst_rew_bonus_eval = [
        HashingBonusEvaluator(dim_key=128, obs_processed_flat_dim=env_dim)
    ]

    self._env_dim = env_dim
    self._act_dim = act_dim
    # Per-timestep trajectory feature width:
    # obs + act + one bonus per evaluator + (rew, done) + policy output
    # params + memory output.
    self._traj_in_dim = (
        env_dim + act_dim + len(self.lst_rew_bonus_eval) + 2
        + policy_output_params * act_dim + self._mem_out_size
    )
    # Learned loss over trajectory windows.
    self._loss = Conv1DLoss(traj_dim_in=self._traj_in_dim)
    # Normalizer covers only the raw (non-policy, non-memory) features.
    self._traj_norm = Normalizer(
        (env_dim + act_dim + len(self.lst_rew_bonus_eval) + 2,))