def __init__(self, env_dim, act_dim, inner_lr=None, **kwargs):
    """Build a softmax-output policy network with per-parameter Adam state.

    Args:
        env_dim: observation dimensionality (input width of the policy MLP).
        act_dim: action dimensionality (output width of the policy MLP).
        inner_lr: required keyword; step size for the per-parameter Adam
            optimizers used for inner-loop updates.
        **kwargs: forwarded unchanged to the base-class initializer.
    """
    assert inner_lr is not None
    # The 1 is the base class's `policy_output_params`: this policy emits a
    # single parameter set per action (a softmax distribution).
    super().__init__(env_dim, act_dim, 1, **kwargs)
    # Two hidden layers of 64 units; softmax output head.
    # (was: [env_dim] + list([64, 64]) + [act_dim] — the list() wrapper
    # around a list literal was redundant)
    self.pi = NN([env_dim, 64, 64, act_dim], out_fn=F.softmax)
    # One independent Adam state per trainable variable.
    self._lst_adam = [
        Adam(var.shape, stepsize=inner_lr) for var in self.backprop_params
    ]
 def __init__(self, env_dim, act_dim, inner_lr=None, **kwargs):
     assert inner_lr is not None
     super().__init__(env_dim, act_dim, 2, **kwargs)
     self._pi = NN([env_dim] + list([64, 64]) + [act_dim],
                   out_fn=lambda x: x)
     self._logstd = C.Variable(np.zeros(act_dim, dtype=np.float32))
     self._lst_adam = [
         Adam(var.shape, stepsize=inner_lr) for var in self.backprop_params
     ]
    def __init__(self,
                 env_dim,
                 act_dim,
                 policy_output_params,
                 memory_out_size=None,
                 inner_n_opt_steps=None,
                 inner_opt_batch_size=None,
                 inner_use_ppo=None,
                 mem=None,
                 buffer_size=None):
        """Set up shared policy-agent state: optional PPO machinery, memory
        module, exploration bonus, trajectory loss, and normalizer.

        Args:
            env_dim: observation dimensionality.
            act_dim: action dimensionality.
            policy_output_params: number of output parameter sets per action
                dimension (e.g. 1 for softmax, 2 for mean + log-std).
            memory_out_size: required keyword; output width of the Memory
                module, also added into the trajectory feature width.
            inner_n_opt_steps: required keyword; inner-loop optimization steps.
            inner_opt_batch_size: required keyword; inner-loop minibatch size.
            inner_use_ppo: required keyword; if truthy, build PPO value
                function and set PPO hyperparameters.
            mem: flag/handle stored as `self._use_mem` — semantics defined by
                subclass usage, not visible here.
            buffer_size: stored verbatim; used elsewhere.
        """
        assert inner_n_opt_steps is not None
        assert inner_opt_batch_size is not None
        assert inner_use_ppo is not None
        # Fail fast: memory_out_size is used in the _traj_in_dim arithmetic
        # and the Memory constructor below; a None would otherwise surface
        # later as a cryptic TypeError.
        assert memory_out_size is not None

        self._use_ppo = inner_use_ppo
        if self._use_ppo:
            # Standard PPO hyperparameters: discount, GAE lambda, KL penalty
            # coefficient, and clip range.
            self._ppo_gam = 0.99
            self._ppo_lam = 0.95
            self._ppo_klcoeff = 0.001
            self._ppo_clipparam = 0.2
            # Scalar value function: two 64-unit hidden layers, identity out.
            # (dropped the redundant list() wrapper around the list literal)
            self._vf = NN([env_dim, 64, 64, 1], out_fn=lambda x: x)

        # Policy net and log-std are created by subclasses.
        self.pi = None
        self._logstd = None
        self._use_mem = mem
        self._buffer_size = buffer_size

        self.inner_n_opt_steps = inner_n_opt_steps
        self.inner_opt_batch_size = inner_opt_batch_size

        self._mem_out_size = memory_out_size
        self._mem = Memory(64, self._mem_out_size)

        # Count-based exploration bonus over hashed observations.
        self.lst_rew_bonus_eval = [
            HashingBonusEvaluator(dim_key=128, obs_processed_flat_dim=env_dim)
        ]

        self._env_dim = env_dim
        self._act_dim = act_dim

        # Per-timestep trajectory feature width:
        # obs + act + one reward-bonus channel per evaluator + 2 (rew, done)
        # + policy output params + memory output.
        self._traj_in_dim = env_dim + act_dim + len(
            self.lst_rew_bonus_eval
        ) + 2 + policy_output_params * act_dim + self._mem_out_size

        self._loss = Conv1DLoss(traj_dim_in=self._traj_in_dim)
        # Normalizer covers only the raw part of the trajectory features
        # (obs, act, bonuses, rew/done) — not policy params or memory output.
        self._traj_norm = Normalizer(
            (env_dim + act_dim + len(self.lst_rew_bonus_eval) + 2, ))