Ejemplo n.º 1
0
class TRPO(object):
    """TRPO agent holding a stochastic policy network and a value network.

    The actual parameter update is delegated to ``update_params``; this class
    only owns the two networks and offers helpers to switch their mode and
    move them between devices.
    """

    def __init__(self, obs_dim, act_dim, normalizer):
        """Create the policy and value networks (both with ``hidden_dim=300``).

        Args:
            obs_dim: Forwarded to both networks as their first argument.
            act_dim: Forwarded to the policy network.
            normalizer: Forwarded unchanged to both networks.
        """
        hidden_units = 300
        self.policy_net = StochasticPolicy(
            obs_dim, act_dim, hidden_dim=hidden_units, normalizer=normalizer)
        self.value_net = Value(
            obs_dim, hidden_dim=hidden_units, normalizer=normalizer)

        # String tag identifying the algorithm.
        self.type = 'TRPO'

    def _networks(self):
        """Return both networks in (policy, value) order."""
        return self.policy_net, self.value_net

    def get_actor(self):
        """Return the policy network."""
        return self.policy_net

    def to_train(self):
        """Call ``train()`` on the policy network, then the value network."""
        for net in self._networks():
            net.train()

    def to_eval(self):
        """Call ``eval()`` on the policy network, then the value network."""
        for net in self._networks():
            net.eval()

    def cpu(self):
        """Call ``cpu()`` on the policy network, then the value network."""
        for net in self._networks():
            net.cpu()

    def to(self, device):
        """Call ``to(device)`` on the policy network, then the value network."""
        for net in self._networks():
            net.to(device)

    def train(self,
              batch,
              entropy_coef=1e-3,
              gamma=0.995,
              tau=0.97,
              l2_reg=1e-3,
              max_kl=1e-2,
              damping=1e-1):
        """Run one update on ``batch`` via ``update_params``.

        Both networks are moved with ``cpu()`` first; every hyperparameter is
        handed to ``update_params`` unchanged, in the positional order
        ``gamma, tau, l2_reg, max_kl, damping, entropy_coef``.
        """
        self.cpu()
        hyperparams = (gamma, tau, l2_reg, max_kl, damping, entropy_coef)
        update_params(batch, self.policy_net, self.value_net, *hyperparams)
Ejemplo n.º 2
0
class PPO(object):
    """PPO agent with separate policy and value networks.

    Both networks are created on the module-level ``device`` and each one is
    optimized by its own Adam optimizer (``lr=3e-4``).
    """

    def __init__(self, obs_dim, act_dim, normalizer, gamma, tau):
        """Build the networks and their optimizers.

        Args:
            obs_dim: Forwarded to both networks as their first argument.
            act_dim: Forwarded to the policy network.
            normalizer: Forwarded unchanged to both networks.
            gamma: Stored and passed to ``gae`` on every ``train`` call
                (presumably the discount factor).
            tau: Stored and passed to ``gae`` on every ``train`` call
                (presumably the GAE smoothing coefficient).
        """
        self.policy_net = StochasticPolicy(obs_dim, act_dim, 300, normalizer).to(device)
        self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer).to(device)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=3e-4)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=3e-4)
        # String tag identifying the algorithm.
        self.type = 'PPO'
        self.gamma = gamma
        self.tau = tau

    def get_actor(self):
        """Return the policy network."""
        return self.policy_net

    def to_train(self):
        """Call ``train()`` on both networks."""
        self.policy_net.train()
        self.value_net.train()

    def to_eval(self):
        """Call ``eval()`` on both networks."""
        self.policy_net.eval()
        self.value_net.eval()

    def cpu(self):
        """Call ``cpu()`` on both networks."""
        self.policy_net.cpu()
        self.value_net.cpu()

    def to(self, device):
        """Call ``to(device)`` on both networks."""
        self.policy_net.to(device)
        self.value_net.to(device)

    @staticmethod
    def _minibatch_indices(num_samples, batch_size):
        """Split a random permutation of ``range(num_samples)`` into minibatches.

        At least one chunk is always produced, so a rollout shorter than
        ``batch_size`` yields a single minibatch instead of making
        ``np.array_split`` raise on a section count of zero.
        """
        num_chunks = max(1, num_samples // batch_size)
        return np.array_split(np.random.permutation(num_samples), num_chunks)

    def train(self, batch, entropy_coef=1e-3, n_iter=1, batch_size=16, clip_param=0.2):
        """Run clipped-surrogate policy updates and clipped value updates.

        Args:
            batch: Object exposing ``state`` and ``action``; it is also handed
                to ``gae`` together with the value network.
            entropy_coef: Accepted for interface compatibility; this method
                does not currently use it.
            n_iter: Number of passes over the whole batch.
            batch_size: Target minibatch size. The batch is split into
                ``num_samples // batch_size`` chunks (at least one).
            clip_param: Clipping range applied both to the probability ratio
                and to the change of the value prediction.
        """
        # NOTE(review): tensors go to the module-level ``device``; if the
        # networks were moved elsewhere via ``to()``/``cpu()`` they must be
        # moved back before calling this method.
        states = torch.Tensor(batch.state).to(device)
        actions = torch.Tensor(batch.action).to(device)
        returns, advantages = gae(batch, self.value_net, self.gamma, self.tau)

        num_samples = returns.shape[0]
        if num_samples == 0:
            # Nothing to learn from; the mean of an empty minibatch would be
            # NaN and would corrupt the parameters on the optimizer step.
            return

        # Reference quantities of the pre-update networks; they are constants
        # for the optimization below, so no autograd graph is needed.
        with torch.no_grad():
            mean, log_std, std = self.policy_net(states)
            old_policy = log_density(actions, mean, std, log_std)
            old_values = self.value_net(states)

        for _ in range(n_iter):
            for idx in self._minibatch_indices(num_samples, batch_size):
                batch_states = states[idx, :]
                batch_actions = actions[idx, :]
                batch_returns = returns[idx, :]
                batch_advantages = advantages[idx, :]
                batch_old_values = old_values[idx, :]
                batch_old_policy = old_policy[idx, :]

                surrogate, ratio = surrogate_loss(self.policy_net, batch_advantages,
                                                  batch_states, batch_old_policy,
                                                  batch_actions)

                # Value update: take the worse (larger) of the clipped and
                # unclipped squared errors.
                values = self.value_net(batch_states)
                clipped_values = batch_old_values + \
                    torch.clamp(values - batch_old_values, -clip_param, clip_param)
                value_loss1 = (clipped_values - batch_returns).pow(2)
                value_loss2 = (values - batch_returns).pow(2)
                value_loss = torch.max(value_loss1, value_loss2).mean()

                self.value_optimizer.zero_grad()
                value_loss.backward()
                self.value_optimizer.step()

                # Policy update: take the smaller of the clipped and unclipped
                # surrogate terms, negated so the optimizer maximizes it.
                clipped_ratio = torch.clamp(ratio, 1 - clip_param, 1 + clip_param)
                clipped_surrogate = clipped_ratio * batch_advantages
                policy_loss = -torch.min(surrogate, clipped_surrogate).mean()

                self.policy_optimizer.zero_grad()
                policy_loss.backward()
                self.policy_optimizer.step()