import numpy as np
import torch
import torch.optim as optim

# NOTE: these import paths assume the standard rlkit package layout.
import rlkit.torch.pytorch_util as ptu
from rlkit.torch.networks import FlattenMlp
from rlkit.torch.sac.policies import TanhGaussianPolicy


def _create_networks(env, config):
    """
    Creates all networks necessary for SAC.

    These networks have to be created before instantiating this class and
    used in the constructor.

    TODO: Maybe this should be reworked one day...

    Args:
        env: The environment the networks act in.
        config: A configuration dictionary.

    Returns:
        A dictionary which contains the networks.
    """
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = config['rl_algorithm_config']['net_size']
    hidden_sizes = [net_size] * config['rl_algorithm_config']['network_depth']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    ).to(device=ptu.device)
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    ).to(device=ptu.device)
    qf1_target = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    ).to(device=ptu.device)
    qf2_target = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    ).to(device=ptu.device)
    policy = TanhGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    ).to(device=ptu.device)

    # Clip gradients element-wise during the backward pass to stabilize
    # training. clip_value is the same constant for every hook, so closing
    # over it in the lambdas is safe.
    clip_value = 1.0
    for p in qf1.parameters():
        p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))
    for p in qf2.parameters():
        p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))
    for p in policy.parameters():
        p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))

    return {
        'qf1': qf1,
        'qf2': qf2,
        'qf1_target': qf1_target,
        'qf2_target': qf2_target,
        'policy': policy,
    }
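
# Usage sketch (hypothetical): building the SAC networks for a Gym
# environment. The config keys mirror the ones read above; the environment
# name and concrete values are illustrative assumptions, not values taken
# from this repository.
#
#   import gym
#
#   env = gym.make('Pendulum-v1')
#   config = {
#       'rl_algorithm_config': {
#           'net_size': 256,
#           'network_depth': 2,
#       },
#   }
#   nets = _create_networks(env, config)
#   policy, qf1, qf2 = nets['policy'], nets['qf1'], nets['qf2']
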
class MlpModel(DynamicsModel):
    def __init__(self,
                 env,
                 n_layers=3,
                 hidden_layer_size=64,
                 optimizer_class=optim.Adam,
                 learning_rate=1e-3,
                 reward_weight=1,
                 **kwargs):
        super().__init__(env=env, **kwargs)
        self.env = env

        obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = int(np.prod(env.action_space.shape))
        self.input_dim = obs_dim
        self.action_dim = action_dim
        self.next_obs_dim = obs_dim
        self.n_layers = n_layers
        self.hidden_layer_size = hidden_layer_size
        self.learning_rate = learning_rate
        self.reward_weight = reward_weight
        self.reset()

        self.reward_dim = 1
        # The network predicts the next observation and the reward jointly;
        # the last reward_dim output columns hold the reward.
        self.net = FlattenMlp(
            hidden_sizes=[hidden_layer_size] * n_layers,
            input_size=self.input_dim + self.action_dim,
            output_size=self.next_obs_dim + self.reward_dim,
        )
        self.net_optimizer = optimizer_class(self.net.parameters(),
                                             lr=learning_rate)

    def to(self, device=None):
        if device is None:
            device = ptu.device
        self.net.to(device)

    def _forward(self, state, action):
        output = self.net(state, action)
        # Split the joint prediction into next state and reward. Terminals
        # are not modeled; the model never predicts episode termination.
        next_state = output[:, :-self.reward_dim]
        reward = output[:, -self.reward_dim:]
        terminal = 0
        env_info = {}
        return next_state, reward, terminal, env_info

    def step(self, action):
        action = ptu.from_numpy(action[np.newaxis, :])
        next_state, reward, terminal, env_info = self._forward(
            self.state, action)
        # Keep the torch tensor as internal state for the next step; return
        # numpy values to the caller.
        self.state = next_state
        next_state = np.squeeze(ptu.get_numpy(next_state))
        reward = np.squeeze(ptu.get_numpy(reward))
        return next_state, reward, terminal, env_info

    def train(self, paths):
        states = ptu.from_numpy(paths["observations"])
        actions = ptu.from_numpy(paths["actions"])
        rewards = ptu.from_numpy(paths["rewards"])
        next_states = ptu.from_numpy(paths["next_observations"])
        terminals = paths["terminals"]  # loaded but not yet used in the loss

        next_state_preds, reward_preds, _, _ = self._forward(states, actions)

        # One gradient step on the combined transition and reward MSE.
        self.net_optimizer.zero_grad()
        self.transition_model_loss = torch.mean(
            (next_state_preds - next_states) ** 2)
        self.reward_model_loss = torch.mean((reward_preds - rewards) ** 2)
        self.net_loss = (self.transition_model_loss
                         + self.reward_weight * self.reward_model_loss)
        self.net_loss.backward()
        self.net_optimizer.step()
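
# Usage sketch (hypothetical): fitting the dynamics model on a batch of
# transitions and querying it for a one-step rollout. The environment, batch
# arrays, and the manual seeding of model.state are illustrative assumptions;
# how DynamicsModel.reset() initializes the state is not shown here.
#
#   env = gym.make('Pendulum-v1')
#   model = MlpModel(env, n_layers=3, hidden_layer_size=64)
#   model.to()
#
#   paths = {
#       'observations': obs_batch,             # (N, obs_dim) float32
#       'actions': act_batch,                  # (N, action_dim) float32
#       'rewards': reward_batch,               # (N, 1) float32
#       'next_observations': next_obs_batch,   # (N, obs_dim) float32
#       'terminals': terminal_batch,           # unused by train() for now
#   }
#   model.train(paths)
#
#   model.state = ptu.from_numpy(obs_batch[:1])  # seed the rollout state
#   next_obs, reward, terminal, info = model.step(act_batch[0])
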