def __init__(self, obs_size, action_space,
             n_hidden_layers=2, n_hidden_channels=64,
             bound_mean=None, normalize_obs=None):
    """Set up the Gaussian policy, the value network and, optionally, an
    observation normalizer.

    Args:
        obs_size: Observation size; passed to the policy, to the value
            MLP and (as ``shape``) to the observation filter.
        action_space: Object exposing ``low`` and ``high``; ``low.size``
            is used as the action dimensionality and ``low``/``high`` are
            forwarded as ``min_action``/``max_action``.
        n_hidden_layers: Number of hidden layers for both networks.
        n_hidden_channels: Width of every hidden layer.
        bound_mean: Forwarded to the policy as ``bound_mean``. Must be
            given explicitly as a boolean.
        normalize_obs: When true, an ``EmpiricalNormalization`` link is
            created as ``self.obs_filter``. Must be given explicitly as a
            boolean.

    Raises:
        AssertionError: If ``bound_mean`` or ``normalize_obs`` is left at
            its ``None`` default (or is otherwise not equal to a boolean).
    """
    # The None defaults are placeholders only: callers have to choose a
    # value for each flag, otherwise these checks fail.
    assert bound_mean in [False, True]
    assert normalize_obs in [False, True]

    super().__init__()

    # One entry per hidden layer of the value MLP, all of equal width.
    value_hidden_sizes = (n_hidden_channels,) * n_hidden_layers
    self.normalize_obs = normalize_obs

    with self.init_scope():
        self.pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
            obs_size,
            action_space.low.size,
            n_hidden_layers,
            n_hidden_channels,
            var_type='diagonal',
            nonlinearity=F.tanh,
            bound_mean=bound_mean,
            min_action=action_space.low,
            max_action=action_space.high,
            mean_wscale=1e-2,
        )
        self.v = links.MLP(obs_size, 1, hidden_sizes=value_hidden_sizes)
        # The filter link only exists when normalization was requested.
        if self.normalize_obs:
            self.obs_filter = links.EmpiricalNormalization(shape=obs_size)
def make_model(self, env):
    """Build a policy and a value function sized for ``env``.

    A softmax policy is created when ``self.discrete`` is true, otherwise
    a Gaussian policy with state-independent diagonal covariance. After
    building the policy, its KL divergence is probed with
    ``trpo._find_old_style_function``; if that reports any functions, the
    current test is skipped via ``self.skipTest``.

    Args:
        env: Environment exposing ``observation_space.low`` and an
            ``action_space`` (``.n`` is read for discrete actions,
            ``.low.size`` otherwise).

    Returns:
        A ``(pi, v)`` tuple: the policy and the value function.
    """
    n_dim_obs = env.observation_space.low.size

    # Hidden-layer settings shared by the value function and both policies.
    hidden_kwargs = dict(
        n_hidden_layers=1,
        n_hidden_channels=20,
        nonlinearity=F.tanh,
    )

    v = v_functions.FCVFunction(
        n_dim_obs,
        last_wscale=0.01,
        **hidden_kwargs
    )

    if self.discrete:
        n_actions = env.action_space.n
        pi = policies.FCSoftmaxPolicy(
            n_dim_obs, n_actions,
            last_wscale=0.01,
            **hidden_kwargs
        )
    else:
        n_dim_actions = env.action_space.low.size
        pi = policies.FCGaussianPolicyWithStateIndependentCovariance(
            n_dim_obs, n_dim_actions,
            mean_wscale=0.01,
            var_type='diagonal',
            **hidden_kwargs
        )

    # Check if KL div supports double-backprop
    fake_obs = np.zeros_like(env.observation_space.low, dtype=np.float32)
    # fake_obs[None] adds a leading axis before the policy is called.
    action_distrib = pi(fake_obs[None])
    kl = action_distrib.kl(action_distrib)
    old_style_funcs = trpo._find_old_style_function([kl])
    if old_style_funcs:
        # The message used to start with a stray "\ " (an invalid escape
        # sequence left over from a line continuation); it now begins
        # directly with the text.
        skip_template = (
            "Chainer v{} does not support double backprop of these "
            "functions: {}."
        )
        self.skipTest(
            skip_template.format(chainer.__version__, old_style_funcs))

    return pi, v