Example #1
0
 def __init__(self, obs_dim, act_dim, normalizer, gamma, tau):
     """Build the PPO actor/critic pair and their Adam optimizers.

     gamma is the discount factor and tau the GAE lambda; both are
     stored for use by the update step.
     """
     self.type = 'PPO'
     self.gamma = gamma
     self.tau = tau
     # Actor and critic share the same observation normalizer.
     self.policy_net = StochasticPolicy(obs_dim, act_dim, 300, normalizer).to(device)
     self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer).to(device)
     self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=3e-4)
     self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=3e-4)
Example #2
0
    def __init__(self, obs_dim, act_dim, normalizer):
        """Create the stochastic actor and the value critic (kept on CPU)."""
        self.type = 'TRPO'
        self.policy_net = StochasticPolicy(
            obs_dim, act_dim, hidden_dim=300, normalizer=normalizer)
        self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer)
Example #3
0
class TRPO(object):
    """Trust-Region Policy Optimization agent: stochastic actor + value critic."""

    def __init__(self, obs_dim, act_dim, normalizer):
        """Create the actor and critic networks (CPU tensors)."""
        self.type = 'TRPO'
        self.policy_net = StochasticPolicy(
            obs_dim, act_dim, hidden_dim=300, normalizer=normalizer)
        self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer)

    def get_actor(self):
        """Return the policy network (the actor)."""
        return self.policy_net

    def to_train(self):
        """Switch actor and critic into training mode."""
        for net in (self.policy_net, self.value_net):
            net.train()

    def to_eval(self):
        """Switch actor and critic into evaluation mode."""
        for net in (self.policy_net, self.value_net):
            net.eval()

    def cpu(self):
        """Move actor and critic to the CPU."""
        for net in (self.policy_net, self.value_net):
            net.cpu()

    def to(self, device):
        """Move actor and critic to ``device``."""
        for net in (self.policy_net, self.value_net):
            net.to(device)

    def train(self,
              batch,
              entropy_coef=1e-3,
              gamma=0.995,
              tau=0.97,
              l2_reg=1e-3,
              max_kl=1e-2,
              damping=1e-1):
        """Run one TRPO update on ``batch``.

        The networks are moved to the CPU first; ``update_params`` performs
        the actual natural-gradient / line-search step.
        """
        self.cpu()
        update_params(batch, self.policy_net, self.value_net, gamma, tau,
                      l2_reg, max_kl, damping, entropy_coef)
Example #4
0
from utils.normalizer import Normalizer
from models.agent import StochasticPolicy, Policy

# NOTE(review): evaluation/rollout script fragment — the inner loop body is
# truncated at the end of this chunk, so it is documented as-is.
env, env_name = flow_env(render=False, use_inflows=True)
print("simulated task: {}".format(env_name))

act_dim = env.action_space.shape[0]
obs_dim = env.observation_space.shape[0]
print(obs_dim)
# Observation normalizer sized to the environment's observation vector.
normalizer = Normalizer(obs_dim)

# Checkpoint to load; the filename prefix selects the matching actor class.
filename = 'ppo_340000'
#filename = 'td3_shortgreenpenalty_1332000'
### load RL policy ###
if 'ppo' in filename:
    # PPO checkpoints were trained with the stochastic actor (hidden dim 300).
    actor = StochasticPolicy(obs_dim, act_dim, 300,
                             normalizer=normalizer).to(device)
elif 'td3' in filename:
    # TD3 checkpoints use the deterministic actor with a wider hidden layer.
    actor = Policy(obs_dim, act_dim, hidden_dim=400,
                   normalizer=normalizer).to(device)
else:
    raise NotImplementedError

# Restore the trained weights from the checkpoint dict.
checkpoint = torch.load('./model_log/' + filename)
actor.load_state_dict(checkpoint['model_state_dict'])
reward_sum = 0.

# Roll out a single episode for up to 100000 steps.
for i in range(1):
    state = env.reset()
    for j in range(100000):
        # Batch the observation (1, obs_dim) before feeding the actor.
        s = torch.from_numpy(state.reshape(1, -1)).float().to(device)
        #print(actor(s))
Example #5
0
class PPO(object):
    """Proximal Policy Optimization agent with clipped surrogate and value losses.

    Holds a stochastic actor and a value critic (both on ``device``) together
    with their Adam optimizers; ``train`` runs minibatch PPO updates on a
    collected rollout batch.
    """

    def __init__(self, obs_dim, act_dim, normalizer, gamma, tau):
        """Build actor/critic networks and optimizers.

        gamma is the discount factor and tau the GAE lambda used when
        estimating advantages.
        """
        # Actor and critic share the same observation normalizer.
        self.policy_net = StochasticPolicy(obs_dim, act_dim, 300, normalizer).to(device)
        self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer).to(device)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=3e-4)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=3e-4)
        self.type = 'PPO'
        self.gamma = gamma
        self.tau = tau

    def get_actor(self):
        """Return the policy network (the actor)."""
        return self.policy_net

    def to_train(self):
        """Put both networks into training mode."""
        self.policy_net.train()
        self.value_net.train()

    def to_eval(self):
        """Put both networks into evaluation mode."""
        self.policy_net.eval()
        self.value_net.eval()

    def cpu(self):
        """Move both networks to the CPU."""
        self.policy_net.cpu()
        self.value_net.cpu()

    def to(self, device):
        """Move both networks to ``device``."""
        self.policy_net.to(device)
        self.value_net.to(device)

    def train(self, batch, entropy_coef=1e-3, n_iter=1, batch_size=16, clip_param=0.2):
        """Run ``n_iter`` epochs of clipped-PPO minibatch updates on ``batch``.

        batch        -- rollout with ``.state`` and ``.action`` arrays.
        entropy_coef -- kept for interface compatibility (currently unused).
        n_iter       -- number of passes over the rollout.
        batch_size   -- target minibatch size.
        clip_param   -- PPO clipping epsilon for both policy ratio and value.
        """
        states = torch.Tensor(batch.state).to(device)
        actions = torch.Tensor(batch.action).to(device)
        returns, advantages = gae(batch, self.value_net, self.gamma, self.tau)

        # Snapshot pre-update policy log-densities and values; no gradients
        # are needed through these "old" quantities.
        with torch.no_grad():
            mean, log_std, std = self.policy_net(states)
            old_policy = log_density(actions, mean, std, log_std)
            old_values = self.value_net(states)

        n_samples = returns.shape[0]
        # BUG FIX: the original passed ``n_samples // batch_size`` sections to
        # np.array_split, which is 0 (a ValueError) whenever the rollout holds
        # fewer samples than batch_size. Clamp to at least one minibatch.
        n_minibatches = max(n_samples // batch_size, 1)

        for _ in range(n_iter):
            index = np.random.permutation(n_samples)
            for idx in np.array_split(index, n_minibatches):
                batch_states = states[idx, :]
                batch_actions = actions[idx, :]
                batch_returns = returns[idx, :]
                batch_advantages = advantages[idx, :]
                batch_old_values = old_values[idx, :]
                batch_old_policy = old_policy[idx, :]

                loss, ratio = surrogate_loss(self.policy_net, batch_advantages, batch_states,
                                            batch_old_policy, batch_actions)

                # Clipped value loss: keep the new value prediction within
                # clip_param of the pre-update value, and take the pessimistic
                # (larger) of the clipped/unclipped squared errors.
                values = self.value_net(batch_states)
                clipped_values = batch_old_values + \
                                torch.clamp(values - batch_old_values, -clip_param, clip_param)
                value_loss1 = (clipped_values - batch_returns).pow(2)
                value_loss2 = (values - batch_returns).pow(2)
                value_loss = torch.max(value_loss1, value_loss2).mean()

                self.value_optimizer.zero_grad()
                value_loss.backward()
                self.value_optimizer.step()

                # Clipped surrogate objective (negated for gradient descent).
                clipped_ratio = torch.clamp(ratio, 1 - clip_param, 1 + clip_param)
                clipped_loss = clipped_ratio * batch_advantages
                loss = -torch.min(loss, clipped_loss).mean()

                self.policy_optimizer.zero_grad()
                loss.backward()
                self.policy_optimizer.step()