Example #1
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.0001,
                 discount_factor=0.99,
                 batch_size=100,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.ratio_clip = 0.2
        self.lam_entropy = 0.01
        self.adv_norm = True
        self.rew_norm = False
        self.schedule_clip = False
        self.schedule_adam = False

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size, replay=False)
        # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)
        self.critic_eval = critic_net.to(self.device)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())

            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.SmoothL1Loss()
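Note: the `target_update_tau` value stored in `self.tau` above implies Polyak averaging of the target networks. The `soft_sync_weight` helper invoked in the `learn()` methods of the later examples is not shown in this listing; a minimal sketch under that assumption:

import torch

@torch.no_grad()
def soft_sync_weight(target_net, eval_net, tau=5e-3):
    # Polyak averaging: target <- tau * eval + (1 - tau) * target
    for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
        t_param.data.copy_(tau * e_param.data + (1.0 - tau) * t_param.data)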
Example #2
    def __init__(
        self,
        model,
        buffer_size=1000,
        learning_rate=1e-3,
        discount_factor=0.99,
        gae_lamda=1,  # mc
        verbose=False,
        num_episodes=1000,
    ):
        super().__init__()
        self.lr = learning_rate
        self.end_lr = self.lr * 0.1
        self.eps = np.finfo(np.float32).eps.item()

        self._gamma = discount_factor
        self._gae_lamda = gae_lamda  # default: 1, MC
        self._learn_cnt = 0
        self._verbose = verbose
        self.schedule_adam = True
        self.buffer = ReplayBuffer(buffer_size, replay=False)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.criterion = nn.SmoothL1Loss()
        self.num_episodes = num_episodes
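With `gae_lamda=1` the advantage estimate degenerates to plain Monte Carlo returns, as the `# mc` comment indicates. A minimal sketch of that return computation (the `rewards`/`masks` names are illustrative, not part of the original class):

import torch

def discounted_returns(rewards, masks, gamma=0.99, eps=1e-8):
    # rewards, masks: per-step lists from one rollout (mask = 0 at episode end)
    ret, returns = 0.0, []
    for r, m in zip(reversed(rewards), reversed(masks)):
        ret = r + gamma * m * ret
        returns.insert(0, ret)
    returns = torch.tensor(returns, dtype=torch.float32)
    # normalize, as done elsewhere with self._normalized(x, self.eps)
    return (returns - returns.mean()) / (returns.std() + eps)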
Example #3
    def __init__(
        self, 
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        act_dim=None,
        alpha=None # default: auto_entropy_tuning
        ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size) # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        
        self.criterion = nn.SmoothL1Loss()
        self.act_dim = act_dim
        self.alpha = alpha
        self.auto_entropy_tuning = True

        if self.alpha:
            self.auto_entropy_tuning = False
            self.value_eval = model.v_net.to(device).train()
            self.value_target = self.copy_net(self.value_eval)
            self.value_eval_optim = optim.Adam(self.value_eval.parameters(), lr=self.lr)
        else:
            self.target_entropy = -torch.tensor(1).to(device)
            self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
            self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
            self.alpha = self.log_alpha.exp()
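From this example onward the constructors rely on a `copy_net` helper instead of the explicit `deepcopy` + `load_state_dict` + `eval()` sequence of Example #1. The helper is not included in the listing; a sketch consistent with that earlier pattern (an assumption, not the original implementation):

from copy import deepcopy

def copy_net(net):
    # Clone a network to serve as a frozen target, mirroring Example #1.
    target = deepcopy(net)
    target.load_state_dict(net.state_dict())
    target.eval()
    return target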
Example #4
    def __init__(
            self,
            model,
            buffer_size=1000,
            actor_learn_freq=1,
            target_update_freq=0,
            target_update_tau=5e-3,
            learning_rate=0.0001,
            discount_factor=0.99,
            gae_lamda=0.95,  # td
            batch_size=100,
            verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.ratio_clip = 0.2
        self.lam_entropy = 0.01
        self.adv_norm = False  # normalize advantage, default=False
        self.rew_norm = False  # normalize reward, default=False
        self.schedule_clip = False
        self.schedule_adam = False

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._gae_lam = gae_lamda
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

        self._verbose = verbose
        self._batch_size = batch_size
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
        self.buffer = ReplayBuffer(buffer_size, replay=False)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        # self.actor_eval.train()
        # self.critic_eval.train()

        if self._target:
            self.actor_target = self.copy_net(self.actor_eval)
            self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.SmoothL1Loss()
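When `adv_norm` is switched on, the `_normalized` lambda defined above standardizes the advantage batch before it enters the clipped PPO objective. An illustrative usage (the `advantage` tensor is hypothetical):

import torch

normalized = lambda x, e: (x - x.mean()) / (x.std() + e)  # same helper as in the class
eps = 1e-8                                                # stands in for np.finfo(np.float32).eps

advantage = torch.randn(100, 1)          # hypothetical advantage batch
advantage = normalized(advantage, eps)   # zero-mean, unit-std advantages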
Example #5
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 batch_size=100,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.replay_buffer = ReplayBuffer(buffer_size)
        # assert buffer.allow_replay, 'DDPG buffer must be replay buffer'

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)  # pi(s)
        self.critic_eval = critic_net.to(self.device)  # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())

            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.MSELoss()  # why mse?
Example #6
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 gae_lamda=1,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.gae_lamda = gae_lamda

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self.buffer = ReplayBuffer(buffer_size, replay=False)
        # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)
        self.critic_eval = critic_net.to(self.device)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())

            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.SmoothL1Loss()
Example #7
    def __init__(self,
                 critic_net,
                 action_shape=0,
                 buffer_size=1000,
                 batch_size=100,
                 target_update_freq=1,
                 target_update_tau=1,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.epsilon = 0.5
        # tricks
        self.double_q = True
        self.dueling_q = True
        self.distributional_q = True
        self.prioritized_replay = True
        self.noisy_q = True
        self.n_step_td = True

        self.target_update_freq = target_update_freq
        self.action_shape = action_shape
        self._gamma = discount_factor
        self._batch_size = batch_size
        self._verbose = verbose
        self._update_iteration = 10
        self._learn_cnt = 0
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
        self.rew_norm = True
        self.buffer = ReplayBuffer(buffer_size)

        self.critic_eval = critic_net.to(self.device)
        self.critic_target = deepcopy(self.critic_eval)
        self.critic_target.load_state_dict(self.critic_eval.state_dict())
        self.critic_eval.use_dueling = self.critic_target.use_dueling = self.dueling_q  # Dueling DQN

        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)
        self.critic_eval.train()

        self.criterion = nn.MSELoss()

        self.random_choose = 0
        self.sum_choose = 0
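The `epsilon`, `random_choose`, and `sum_choose` fields suggest an epsilon-greedy action selector that is not part of this snippet. A minimal sketch of such a method, assuming `critic_eval(state)` returns one Q-value per discrete action (the method name and shapes are assumptions):

    def choose_action(self, state, test=False):
        # Epsilon-greedy sketch: explore with probability self.epsilon, else act greedily.
        self.sum_choose += 1
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        if not test and np.random.rand() < self.epsilon:
            self.random_choose += 1
            return np.random.randint(self.action_shape)
        with torch.no_grad():
            q_values = self.critic_eval(state)
        return int(q_values.argmax(dim=-1).item())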
Example #8
    def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=1,
        target_update_freq=1,
        target_update_tau=0.01,
        # learning_rate=3e-3,
        actor_lr=1e-4,
        critic_lr=1e-3,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
    ):
        super().__init__()
        # self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=actor_lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=critic_lr)

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.MSELoss()  # why mse?

        self.noise_clip = 0.5
        self.noise_std = 0.2
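`noise_std` and `noise_clip` correspond to TD3's target-policy smoothing; `actor_target.action(S_, noise_std, noise_clip)` in the `learn()` of Example #14 is assumed to add clipped Gaussian noise to the target action, roughly as in this sketch (the action bound `act_limit` is an assumption):

import torch

def smoothed_target_action(actor_target, next_state, noise_std=0.2, noise_clip=0.5, act_limit=1.0):
    # TD3 target-policy smoothing: perturb the target action with clipped Gaussian noise.
    with torch.no_grad():
        action = actor_target(next_state)
        noise = (torch.randn_like(action) * noise_std).clamp(-noise_clip, noise_clip)
        return (action + noise).clamp(-act_limit, act_limit)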
Example #9
    def __init__(
        self, 
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        act_dim=None,
        alpha=1.0,
        ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size) # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.value_eval = model.v_net.to(device).train()

        self.value_target = self.copy_net(self.value_eval)
        
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.value_eval_optim = optim.Adam(self.value_eval.parameters(), lr=self.lr)

        self.criterion = nn.SmoothL1Loss()
        self.act_dim = act_dim
        self.alpha = alpha
Example #10
    def __init__(
        self,
        model,
        buffer_size=1e6,
        batch_size=256,
        policy_freq=2,
        tau=0.005,
        discount=0.99,
        policy_lr=3e-4,
        value_lr=3e-4,
        learn_iteration=1,
        verbose=False,
        act_dim=None,
        alpha=1.0,
    ):
        super().__init__()
        self.tau = tau
        self.gamma = discount
        self.policy_freq = policy_freq
        self.learn_iteration = learn_iteration
        self.verbose = verbose
        self.act_dim = act_dim
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.value_eval = model.v_net.to(device).train()

        self.value_target = self.copy_net(self.value_eval)

        self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(),
                                                 lr=policy_lr)
        self.critic_eval_optim = torch.optim.Adam(
            self.critic_eval.parameters(), lr=value_lr)
        self.value_eval_optim = torch.optim.Adam(self.value_eval.parameters(),
                                                 lr=value_lr)

        self.criterion = nn.SmoothL1Loss()

        self.alpha = alpha
        self.eps = np.finfo(np.float32).eps.item()
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
Example #11
    def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=1,
        target_update_freq=1,
        target_update_tau=0.005,
        learning_rate=1e-4,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
        act_dim=None,
        num_episodes=1000,
    ):
        super().__init__()
        self.lr = learning_rate
        self.end_lr = learning_rate * 0.1
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.schedule_adam = True
        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()  # pi(s)
        self.critic_eval = model.value_net.to(device).train()  # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.MSELoss()  # why mse?
        self.act_dim = act_dim
        self.num_episodes = num_episodes
Example #12
    def __init__(self,
                 model,
                 action_shape=0,
                 buffer_size=1000,
                 batch_size=100,
                 target_update_freq=1,
                 target_update_tau=1,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.target_update_freq = target_update_freq
        # self.action_shape = action_shape
        self._gamma = discount_factor
        self._batch_size = batch_size
        self._verbose = verbose
        self._update_iteration = 10
        self._learn_cnt = 0
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
        self.rew_norm = True
        self.buffer = ReplayBuffer(buffer_size)

        # self.declare_networks()
        self.critic_eval = model.value_net.to(self.device).train()
        self.critic_target = self.copy_net(self.critic_eval)

        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)
        # self.critic_eval.train()

        self.criterion = nn.MSELoss()

        self.random_choose = 0
        self.sum_choose = 0
Example #13
class SAC1(BasePolicy):  # pg_net + q_net + v_net
    def __init__(
        self,
        model,
        buffer_size=1e6,
        batch_size=256,
        policy_freq=2,
        tau=0.005,
        discount=0.99,
        policy_lr=3e-4,
        value_lr=3e-4,
        learn_iteration=1,
        verbose=False,
        act_dim=None,
        alpha=1.0,
    ):
        super().__init__()
        self.tau = tau
        self.gamma = discount
        self.policy_freq = policy_freq
        self.learn_iteration = learn_iteration
        self.verbose = verbose
        self.act_dim = act_dim
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.value_eval = model.v_net.to(device).train()

        self.value_target = self.copy_net(self.value_eval)

        self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(),
                                                 lr=policy_lr)
        self.critic_eval_optim = torch.optim.Adam(
            self.critic_eval.parameters(), lr=value_lr)
        self.value_eval_optim = torch.optim.Adam(self.value_eval.parameters(),
                                                 lr=value_lr)

        self.criterion = nn.SmoothL1Loss()

        self.alpha = alpha
        self.eps = np.finfo(np.float32).eps.item()
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

    def learn(self):
        pg_loss, q_loss, v_loss = 0, 0, 0
        for _ in range(self.learn_iteration):
            batch = self.buffer.split_batch(self.batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32,
                             device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)

            new_A, log_prob = self.actor_eval.evaluate(S)

            # V_value loss
            with torch.no_grad():
                new_q1_value, new_q2_value = self.critic_eval(S, new_A)
                next_value = torch.min(new_q1_value,
                                       new_q2_value) - self.alpha * log_prob
            value = self.value_eval(S)
            value_loss = self.criterion(value, next_value)

            # Soft q loss
            with torch.no_grad():
                target_value = self.value_target(S_)
                target_q_value = R + M * self.gamma * target_value.cpu()
                target_q_value = target_q_value.to(device)
            q1_value, q2_value = self.critic_eval(S, A)
            loss1 = self.criterion(q1_value, target_q_value)
            loss2 = self.criterion(q2_value, target_q_value)
            critic_loss = 0.5 * (loss1 + loss2)

            # update V
            self.value_eval_optim.zero_grad()
            value_loss.backward()
            self.value_eval_optim.step()

            # update soft Q
            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            # update policy
            if self._learn_critic_cnt % self.policy_freq == 0:
                # policy loss
                actor_loss = (self.alpha * log_prob -
                              torch.min(new_q1_value, new_q2_value)).mean()
                # actor_loss = (log_prob - torch.min(new_q1_value, new_q2_value).detach()).mean()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1

                self.soft_sync_weight(self.value_target, self.value_eval,
                                      self.tau)

            pg_loss += actor_loss.item()
            q_loss += critic_loss.item()
            v_loss += value_loss.item()

        return pg_loss, q_loss, v_loss
Example #14
class TD3(BasePolicy):
    def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=2,
        target_update_freq=1,
        target_update_tau=0.005,
        learning_rate=1e-4,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
        act_dim=None,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.MSELoss()  # why mse?
        self.act_dim = act_dim

        self.noise_clip = 0.5
        self.noise_std = 0.2

    def learn(self):
        loss_actor_avg, loss_critic_avg = 0, 0

        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
            S = torch.tensor(batch['s'], dtype=torch.float32,
                             device=device)  # [batch_size, S.feature_size]
            A = torch.tensor(batch['a'], dtype=torch.float32,
                             device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)
            if self._verbose:
                print(
                    f'Shape S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}'
                )
            A_noise = self.actor_target.action(S_, self.noise_std,
                                               self.noise_clip)

            with torch.no_grad():
                q1_next, q2_next = self.critic_target.twinQ(S_, A_noise)
                q_next = torch.min(q1_next, q2_next)
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)
            q1_eval, q2_eval = self.critic_eval.twinQ(S, A)
            loss1 = self.criterion(q1_eval, q_target)
            loss2 = self.criterion(q2_eval, q_target)
            critic_loss = 0.5 * (loss1 + loss2)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()

            loss_critic_avg += critic_loss.item()
            self._learn_critic_cnt += 1
            if self._verbose:
                print(
                    f'=======Learn_Critic_Net, cnt{self._learn_critic_cnt}======='
                )

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                loss_actor_avg += actor_loss.item()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(
                        f'=======Learn_Actor_Net, cnt{self._learn_actor_cnt}======='
                    )

            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(
                        f'=======Soft_sync_weight of TD3, tau{self.tau}======='
                    )
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)

        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration
        return loss_actor_avg, loss_critic_avg
Example #15
class SAC2(BasePolicy):  # pg_net + q_net + alpha
    def __init__(
        self,
        model,
        buffer_size=1e6,
        batch_size=256,
        policy_freq=2,
        tau=0.005,
        discount=0.99,
        policy_lr=3e-4,
        value_lr=3e-4,
        learn_iteration=1,
        verbose=False,
        act_dim=None,
    ):
        super().__init__()
        self.tau = tau
        self.gamma = discount
        self.policy_freq = policy_freq
        self.learn_iteration = learn_iteration
        self.verbose = verbose
        self.act_dim = act_dim
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(),
                                                 lr=policy_lr)
        self.critic_eval_optim = torch.optim.Adam(
            self.critic_eval.parameters(), lr=value_lr)

        self.criterion = nn.SmoothL1Loss()
        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=policy_lr)
        self.alpha = self.log_alpha.exp()

        self.eps = np.finfo(np.float32).eps.item()
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self.learn_iteration):
            batch = self.buffer.split_batch(self.batch_size)

            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
                self.target_entropy = -torch.tensor(self.act_dim).to(device)

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32,
                             device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)

            if self.verbose:
                print(
                    f'shape S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}'
                )

            with torch.no_grad():
                next_A, next_log = self.actor_target.evaluate(S_)
                q1_next, q2_next = self.critic_target(S_, next_A)
                q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                q_target = R + M * self.gamma * q_next.cpu()
                q_target = q_target.to(device)
            # q_loss
            q1_eval, q2_eval = self.critic_eval(S, A)
            loss1 = self.criterion(q1_eval, q_target)
            loss2 = self.criterion(q2_eval, q_target)
            critic_loss = 0.5 * (loss1 + loss2)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.policy_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                q1_next, q2_next = self.critic_eval(S, curr_A)
                q_next = torch.min(q1_next, q2_next)

                # pg_loss
                actor_loss = (self.alpha * curr_log - q_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1

                # alpha loss
                alpha_loss = -(
                    self.log_alpha *
                    (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

        return pg_loss, q_loss, a_loss
Example #16
    def __init__(
        self,
        model,
        action_dim=1,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=0.1,
        # learning_rate=1e-3,
        actor_lr=1e-4,
        critic_lr=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        use_priority=False,
        use_m=False,
        n_step=1,
    ):
        super().__init__()
        # self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.use_priority = use_priority
        self.use_dist = model.value_net.use_dist
        self.use_munchausen = use_m
        self.n_step = n_step

        if self.use_priority:
            self.buffer = PriorityReplayBuffer(buffer_size, n_step=self.n_step)
        else:
            self.buffer = ReplayBuffer(buffer_size)  # off-policy

        if self.use_dist:
            assert model.value_net.num_atoms > 1
            # assert isinstance(model.value_net, CriticModelDist)
            self.v_min = model.value_net.v_min
            self.v_max = model.value_net.v_max
            self.num_atoms = model.value_net.num_atoms
            self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
            self.support = torch.linspace(self.v_min, self.v_max,
                                          self.num_atoms)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=actor_lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=critic_lr)

        self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim

        self.target_entropy = -torch.tensor(action_dim).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=actor_lr)
        self.alpha = self.log_alpha.exp()
Example #17
class DDPG(BasePolicy):
    def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=1,
        target_update_freq=1,
        target_update_tau=0.005,
        learning_rate=1e-4,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
        act_dim=None,
        num_episodes=1000,
    ):
        super().__init__()
        self.lr = learning_rate
        self.end_lr = learning_rate * 0.1
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.schedule_adam = True
        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()  # pi(s)
        self.critic_eval = model.value_net.to(device).train()  # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.MSELoss()  # why mse?
        self.act_dim = act_dim
        self.num_episodes = num_episodes

    def learn(self):
        loss_actor_avg, loss_critic_avg = 0, 0

        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)  # [batch_size, state_dim]
            # print(batch['a'])
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)  # [batch_size, act_dim]
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1) # [batch_size, 1]
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1) # [batch_size, 1]
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device) # [batch_size, state_dim]
            if self._verbose:
                print(f'Shape S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}')

            with torch.no_grad():
                q_next = self.critic_target(S_, self.actor_target(S_))
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)
            q_eval = self.critic_eval(S, A)  # [batch_size, q_value_size]
            critic_loss = self.criterion(q_eval, q_target)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()

            loss_critic_avg += critic_loss.item()
            self._learn_critic_cnt += 1
            if self._verbose:
                print(f'=======Learn_Critic_Net, cnt:{self._learn_critic_cnt}=======')

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                loss_actor_avg += actor_loss.item()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(f'=======Learn_Actor_Net, cnt:{self._learn_actor_cnt}=======')

            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of DDPG, tau:{self.tau}=======')
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        if self.schedule_adam:
            new_lr = self.lr + (self.end_lr - self.lr) / self.num_episodes \
                * self._learn_critic_cnt / self._update_iteration
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr

        loss_actor_avg /= (self._update_iteration/self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration
        return loss_actor_avg, loss_critic_avg
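The hand-rolled `new_lr` interpolation above (a linear ramp from `lr` down to `end_lr = 0.1 * lr` over `num_episodes`) could also be expressed with PyTorch's built-in scheduler; a sketch using `torch.optim.lr_scheduler.LambdaLR`, stepped once per episode:

from torch.optim.lr_scheduler import LambdaLR

def make_linear_schedulers(actor_optim, critic_optim, num_episodes):
    # Multiplicative factor falls linearly from 1.0 to 0.1, i.e. lr -> end_lr.
    decay = lambda episode: 1.0 - 0.9 * min(episode / num_episodes, 1.0)
    return LambdaLR(actor_optim, lr_lambda=decay), LambdaLR(critic_optim, lr_lambda=decay)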
Example #18
class OAC(BasePolicy):  # no value network
    def __init__(self,
                 model,
                 buffer_size=1000,
                 batch_size=100,
                 actor_learn_freq=1,
                 target_update_freq=5,
                 target_update_tau=0.01,
                 learning_rate=1e-3,
                 discount_factor=0.99,
                 verbose=False,
                 update_iteration=10,
                 act_dim=None):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.criterion = nn.SmoothL1Loss()

        self.act_dim = act_dim
        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()

    def choose_action(self, state, test=False, beta_UB=1.0, delta=1.0):
        # paper: Better Exploration with Optimistic Actor-Critic, NeurIPS 2019
        # pdf: https://arxiv.org/pdf/1910.12807.pdf
        # ref: https://github.com/microsoft/oac-explore/blob/master/optimistic_exploration.py
        # paper param: beta_UB=4.66 delta=23.53, env_name=humanoid
        state = torch.tensor(state, dtype=torch.float32, device=device)
        if test:
            self.actor_eval.eval()
            mean, log_std = self.actor_eval(state)
            return mean.detach().cpu().numpy()

        assert len(list(state.shape)) == 1  # not batch
        mu_T, log_std = self.actor_eval(state)
        std = torch.exp(log_std)
        # assert len(list(mu_T.shape)) == 1, mu_T
        # assert len(list(std.shape)) == 1
        mu_T.requires_grad_()
        curr_act = torch.tanh(mu_T).unsqueeze(0)  # action
        state = state.unsqueeze(0)

        q1, q2 = self.critic_target(state, curr_act)
        mu_q = (q1 + q2) / 2.0
        sigma_q = torch.abs(q1 - q2) / 2.0
        Q_UB = mu_q + beta_UB * sigma_q

        grad = torch.autograd.grad(Q_UB, mu_T)
        grad = grad[0]

        assert grad is not None
        assert mu_T.shape == grad.shape

        sigma_T = torch.pow(std, 2)
        denom = torch.sqrt(torch.sum(torch.mul(torch.pow(grad, 2),
                                               sigma_T))) + 10e-6

        mu_C = np.sqrt(2.0 * delta) * torch.mul(sigma_T, grad) / denom
        assert mu_C.shape == mu_T.shape
        mu_E = mu_T + mu_C
        assert mu_E.shape == std.shape

        normal = Normal(mu_E, std)
        z = normal.sample()
        action = torch.tanh(z).detach().cpu().numpy()
        return action

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
                self.target_entropy = -torch.tensor(self.act_dim).to(device)

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32,
                             device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)

            # print (f'size S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}, W:{W.size()}')
            with torch.no_grad():
                next_A, next_log = self.actor_target.evaluate(S_)
                q1_next, q2_next = self.critic_target(S_, next_A)
                q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            # q_loss
            q1_eval, q2_eval = self.critic_eval(S, A)
            critic_loss = self.criterion(q1_eval, q_target) + self.criterion(
                q2_eval, q_target)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                q1_next, q2_next = self.critic_eval(S, curr_A)
                q_next = torch.min(q1_next, q2_next)

                # pg_loss
                actor_loss = (self.alpha * curr_log - q_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(
                    self.log_alpha *
                    (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item() * 0.5
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)

        return pg_loss, q_loss, a_loss
Example #19
class MSAC(BasePolicy):
    def __init__(
        self,
        model,
        action_dim=1,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=0.1,
        # learning_rate=1e-3,
        actor_lr=1e-4,
        critic_lr=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        use_priority=False,
        use_m=False,
        n_step=1,
    ):
        super().__init__()
        # self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.use_priority = use_priority
        self.use_dist = model.value_net.use_dist
        self.use_munchausen = use_m
        self.n_step = n_step

        if self.use_priority:
            self.buffer = PriorityReplayBuffer(buffer_size, n_step=self.n_step)
        else:
            self.buffer = ReplayBuffer(buffer_size)  # off-policy

        if self.use_dist:
            assert model.value_net.num_atoms > 1
            # assert isinstance(model.value_net, CriticModelDist)
            self.v_min = model.value_net.v_min
            self.v_max = model.value_net.v_max
            self.num_atoms = model.value_net.num_atoms
            self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
            self.support = torch.linspace(self.v_min, self.v_max,
                                          self.num_atoms)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=actor_lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=critic_lr)

        self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim

        self.target_entropy = -torch.tensor(action_dim).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=actor_lr)
        self.alpha = self.log_alpha.exp()

    def _tensor(self, data, use_cuda=False):
        if np.array(data).ndim == 1:
            data = torch.tensor(data, dtype=torch.float32).view(-1, 1)
        else:
            data = torch.tensor(data, dtype=torch.float32)
        if use_cuda:
            data = data.to(device)
        return data

    def learn_critic_dist(self, obs, act, rew, next_obs, mask):
        with torch.no_grad():
            next_act, next_log_pi = self.actor_target.evaluate(next_obs)
            # q(s, a) is replaced by a distribution z(s, a) over the atom support
            p1_next, p2_next = self.critic_target.get_probs(
                next_obs, next_act)  # [batch_size, num_atoms]
            p_next = torch.stack([
                torch.where(z1.sum() < z2.sum(), z1, z2)
                for z1, z2 in zip(p1_next, p2_next)
            ])
            p_next -= (self.alpha * next_log_pi)
            Tz = rew.unsqueeze(1) + mask * self.support.unsqueeze(0)
            Tz = Tz.clamp(min=self.v_min, max=self.v_max)

            b = (Tz - self.v_min) / self.delta_z
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.num_atoms - 1)) * (l == u)] += 1

            m = obs.new_zeros(self._batch_size, self.num_atoms).cpu()
            p_next = p_next.cpu()
            # print (f'm device: {m.device}')
            # print (f'p_next device: {p_next.device}')
            offset = torch.linspace(0,
                                    ((self._batch_size - 1) * self.num_atoms),
                                    self._batch_size).unsqueeze(1).expand(
                                        self._batch_size, self.num_atoms).to(l)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (p_next *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (p_next *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        m = m.to(device)
        log_z1, log_z2 = self.critic_eval.get_probs(obs, act, log=True)
        loss1 = -(m * log_z1).sum(dim=1)
        loss2 = -(m * log_z2).sum(dim=1)
        return 0.5 * (loss1 + loss2)

    def learn_critic(self, obs, act, rew, next_obs, mask):
        with torch.no_grad():
            next_A, next_log = self.actor_target.evaluate(next_obs)
            # print (f'nextA shape is {next_A.shape}')
            q1_next, q2_next = self.critic_target.twinQ(next_obs, next_A)
            # print (f'shape q1 {q1_next.shape}, q2 {q2_next.shape}, next_obs {next_obs.shape}, next_A {next_A.shape}')
            # print (f'q1_next shape is {q1_next.shape}')
            # q_next = torch.stack([torch.where(q1.sum() < q2.sum(), q1, q2) for q1, q2 in zip(q1_next, q2_next)])
            # print (f'shape stack q_next {q_next.shape} ')
            q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
            # print (f'q_next shape is {q_next.shape}')
            # print(f'shpae rew {rew.shape}, mask {mask.shape}, q_next {q_next.shape}')
            q_target = rew + mask * self._gamma * q_next.cpu()
            if self.use_priority:
                q_target = rew + mask * (self._gamma**
                                         self.n_step) * q_next.cpu()
            # print (f'q_target shape is {q_target.shape}')
            q_target = q_target.to(device)

        # q_loss
        q1_eval, q2_eval = self.critic_eval.twinQ(obs, act)
        criterion = nn.SmoothL1Loss(reduction='none')
        # print (f'q1_eval shape is {q1_eval.shape}')
        loss1 = criterion(q1_eval, q_target)
        loss2 = criterion(q2_eval, q_target)
        return 0.5 * (loss1 + loss2)

    def learn_actor_dist(self, obs):
        curr_act, curr_log = self.actor_eval.evaluate(obs)
        p1_next, p2_next = self.critic_eval.get_probs(obs, curr_act)
        p_next = torch.stack([
            torch.where(p1.sum() < p2.sum(), p1, p2)
            for p1, p2 in zip(p1_next, p2_next)
        ])
        num_atoms = torch.tensor(self.num_atoms,
                                 dtype=torch.float32,
                                 device=device)
        # actor_loss = p_next * num_atoms
        # actor_loss = torch.sum(actor_loss, dim=1)
        # actor_loss = -(actor_loss + self.alpha * curr_log).mean()
        actor_loss = (self.alpha * curr_log - p_next)
        actor_loss = torch.sum(actor_loss, dim=1)
        actor_loss = actor_loss.mean()
        return actor_loss, curr_log

    def learn_actor(self, obs):
        curr_act, curr_log = self.actor_eval.evaluate(obs)
        q1_next, q2_next = self.critic_eval.twinQ(obs, curr_act)
        q_next = torch.min(q1_next, q2_next)
        actor_loss = (self.alpha * curr_log - q_next).mean()
        return actor_loss, curr_log

    def get_munchausen_rew(self, obs, act, rew):
        self.m_alpha = 0.9
        self.m_tau = 0.03
        self.lo = -1
        mu, log_std = self.actor_eval(obs)
        std = log_std.exp()
        dist = Normal(mu, std)
        log_pi_a = self.m_tau * dist.log_prob(act).mean(1).unsqueeze(1).cpu()
        m_rew = rew + self.m_alpha * torch.clamp(log_pi_a, min=self.lo, max=0)
        return m_rew

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            if self.use_priority:
                S, A, R, S_, M, indices, weights = self.buffer.sample(
                    self._batch_size)
                W = torch.tensor(weights, dtype=torch.float32,
                                 device=device).view(-1, 1)
            else:
                batch_split = self.buffer.split_batch(self._batch_size)
                S, A, M, R, S_ = batch_split['s'], batch_split[
                    'a'], batch_split['m'], batch_split['r'], batch_split['s_']
            # print ('after sampling from buffer!')
            R = torch.tensor(R, dtype=torch.float32).view(-1, 1)
            S = torch.tensor(S, dtype=torch.float32, device=device)
            # A = torch.tensor(A, dtype=torch.float32, device=device).view(-1, 1)
            A = torch.tensor(A, dtype=torch.float32, device=device).squeeze(1)
            # print (f'A shape {A.shape}')
            M = torch.tensor(M, dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(S_, dtype=torch.float32, device=device)

            # self.use_munchausen = True
            if self.use_munchausen:
                R = self.get_munchausen_rew(S, A, R)
            # print (f'shape S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}')
            if self.use_dist:
                # D = torch.from_numpy(np.array([1^int(mask.item()) for mask in M])).view(-1, 1)
                # print (f'size S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}, D:{D.shape}')
                # assert 0
                batch_loss = self.learn_critic_dist(S, A, R, S_, M)
            else:
                batch_loss = self.learn_critic(S, A, R, S_, M)

            if self.use_priority:
                critic_loss = (W * batch_loss).mean()
                td_errors = batch_loss.detach().cpu().numpy().sum(1)
                # print(batch_loss)
                # print(td_errors)
                self.buffer.update_priorities(indices,
                                              np.abs(td_errors) + 1e-6)
            else:
                critic_loss = batch_loss.mean()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                if self.use_dist:
                    actor_loss, curr_log = self.learn_actor_dist(S)
                else:
                    actor_loss, curr_log = self.learn_actor(S)

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(
                    self.log_alpha *
                    (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)

        return pg_loss, q_loss, a_loss
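
# `soft_sync_weight` is called throughout these examples but its implementation is not
# included in this listing. A minimal sketch of the Polyak (soft) update it presumably
# performs; the standalone name and signature below are assumptions for illustration.
import torch.nn as nn

def soft_sync_weight_sketch(target_net: nn.Module, eval_net: nn.Module, tau: float) -> None:
    # target <- (1 - tau) * target + tau * eval, parameter by parameter
    for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * e_param.data)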
Ejemplo n.º 20
0
class Rainbow(BasePolicy):  # options: double (done), dueling (todo), noisy (todo), n-step (todo)
    def __init__(self,
                 critic_net,
                 action_shape=0,
                 buffer_size=1000,
                 batch_size=100,
                 target_update_freq=1,
                 target_update_tau=1,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.epsilon = 0.5
        # ticks
        self.double_q = True
        self.dueling_q = True
        self.distributional_q = True
        self.prioritized_replay = True
        self.noisy_q = True
        self.n_step_td = True

        self.target_update_freq = target_update_freq
        self.action_shape = action_shape
        self._gamma = discount_factor
        self._batch_size = batch_size
        self._verbose = verbose
        self._update_iteration = 10
        self._learn_cnt = 0
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
        self.rew_norm = True
        self.buffer = ReplayBuffer(buffer_size)

        self.critic_eval = critic_net.to(self.device)
        self.critic_target = deepcopy(self.critic_eval)
        self.critic_target.load_state_dict(self.critic_eval.state_dict())
        self.critic_eval.use_dueling = self.critic_target.use_dueling = self.dueling_q  # Dueling DQN

        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)
        self.critic_eval.train()

        self.criterion = nn.MSELoss()

        self.random_choose = 0
        self.sum_choose = 0

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32,
                             device=self.device).unsqueeze(0)
        q_values = self.critic_eval(state)
        action = q_values.argmax(dim=1).cpu().data.numpy()
        action = action[0] if self.action_shape == 0 else action.reshape(
            self.action_shape)  # return the argmax index

        if test: self.epsilon = 1.0
        if np.random.rand() >= self.epsilon:  # epsilon-greedy (epsilon is the probability of acting greedily here)
            self.random_choose += 1
            action = np.random.randint(0, q_values.size()[-1])
            action = action if self.action_shape == 0 else action.reshape(
                self.action_shape)

        self.sum_choose += 1
        return action

    def learn(self):
        for _ in range(self._update_iteration):
            batch_split = self.buffer.split_batch(
                self._batch_size)  # s, a, r, s_
            S = torch.tensor(batch_split['s'],
                             dtype=torch.float32,
                             device=self.device)
            A = torch.tensor(batch_split['a'],
                             dtype=torch.float32,
                             device=self.device).view(-1, 1)
            M = torch.tensor(batch_split['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch_split['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch_split['s_'],
                              dtype=torch.float32,
                              device=self.device)
            # print (f'SIZE S {S.size()}, A {A.size()}, M {M.size()}, R {R.size()}, S_ {S_.size()}')
            if self.rew_norm: R = self._normalized(R, self.eps)

            with torch.no_grad():
                get_action_net = self.critic_eval if self.double_q else self.critic_target  # Double DQN
                argmax_action = get_action_net(S_).max(dim=1, keepdim=True)[1]
                q_next = self.critic_target(S_).gather(
                    1, argmax_action.type(torch.long))
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(self.device)

            q_eval = self.critic_eval(S).gather(1, A.type(torch.long))

            critic_loss = self.criterion(q_eval, q_target)
            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()

            self._learn_cnt += 1
            if self._learn_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of DQN=======')
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
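
# A minimal sketch of the TD target that Rainbow.learn builds above: with double_q
# enabled, the greedy next action comes from the online network and its value from the
# target network. The standalone name and tensor arguments below are placeholders.
import torch

def double_dqn_target_sketch(reward, mask, gamma, q_online_next, q_target_next):
    # reward, mask: [batch, 1]; q_online_next, q_target_next: [batch, num_actions]
    argmax_a = q_online_next.argmax(dim=1, keepdim=True)   # a* = argmax_a Q_online(s', a)
    q_next = q_target_next.gather(1, argmax_a)              # Q_target(s', a*)
    return reward + mask * gamma * q_next                   # y = r + m * gamma * Q_target(s', a*)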
Ejemplo n.º 21
0
class PPO(BasePolicy):  # option: double
    def __init__(
            self,
            model,
            buffer_size=1000,
            actor_learn_freq=1,
            target_update_freq=0,
            target_update_tau=5e-3,
            learning_rate=0.0001,
            discount_factor=0.99,
            gae_lamda=0.95,  # td
            batch_size=100,
            verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.ratio_clip = 0.2
        self.lam_entropy = 0.01
        self.adv_norm = False  # normalize advantage, default=False
        self.rew_norm = False  # normalize reward, default=False
        self.schedule_clip = False
        self.schedule_adam = False

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._gae_lam = gae_lamda
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

        self._verbose = verbose
        self._batch_size = batch_size
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
        self.buffer = ReplayBuffer(buffer_size, replay=False)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        # self.actor_eval.train()
        # self.critic_eval.train()

        if self._target:
            self.actor_target = self.copy_net(self.actor_eval)
            self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.SmoothL1Loss()

    def learn(self, i_episode=0, num_episode=100):
        if not self.buffer.is_full():
            print(
                f'Waiting for a full buffer: {len(self.buffer)}/{self.buffer.capacity()} ',
                end='\r')
            return 0, 0

        loss_actor_avg = 0
        loss_critic_avg = 0

        memory_split = self.buffer.split(self.buffer.all_memory())
        S = torch.tensor(memory_split['s'], dtype=torch.float32, device=device)
        A = torch.tensor(memory_split['a'], dtype=torch.float32,
                         device=device).view(-1, 1)
        S_ = torch.tensor(memory_split['s_'],
                          dtype=torch.float32,
                          device=device)
        R = torch.tensor(memory_split['r'], dtype=torch.float32).view(-1, 1)
        Log = torch.tensor(memory_split['l'],
                           dtype=torch.float32,
                           device=device).view(-1, 1)

        # print (f'Size S {S.size()}, A {A.size()}, S_ {S_.size()}, R {R.size()}, Log {Log.size()}')
        # print (f'S {S}, A {A}, S_ {S_}, R {R}, Log {Log}')
        with torch.no_grad():
            v_evals = self.critic_eval(S).cpu().numpy()
            end_v_eval = self.critic_eval(S_[-1]).cpu().numpy()

        rewards = self._normalized(
            R, self.eps).numpy() if self.rew_norm else R.numpy()
        # rewards = rewards.cpu().numpy()
        adv_gae_td = self.GAE(rewards,
                              v_evals,
                              next_v_eval=end_v_eval,
                              gamma=self._gamma,
                              lam=self._gae_lam)  # td_error adv
        advantage = torch.from_numpy(adv_gae_td).to(device).unsqueeze(-1)
        advantage = self._normalized(advantage,
                                     1e-10) if self.adv_norm else advantage

        # indices = [i for i in range(len(self.buffer))]
        for _ in range(self._update_iteration):
            v_eval = self.critic_eval(S)
            v_target = advantage + v_eval.detach()

            critic_loss = self.criterion(v_eval, v_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                # actor_core
                mu, sigma = self.actor_eval(S)
                dist = Normal(mu, sigma)
                new_log_prob = dist.log_prob(A)

                pg_ratio = torch.exp(new_log_prob -
                                     Log)  # size = [batch_size, 1]
                clipped_pg_ratio = torch.clamp(pg_ratio, 1.0 - self.ratio_clip,
                                               1.0 + self.ratio_clip)
                surrogate_loss = -torch.min(
                    pg_ratio * advantage, clipped_pg_ratio * advantage).mean()

                # policy entropy
                loss_entropy = -torch.mean(
                    torch.exp(new_log_prob) * new_log_prob)

                actor_loss = surrogate_loss - self.lam_entropy * loss_entropy

                loss_actor_avg += actor_loss.item()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(f'=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print(f'=======Soft_sync_weight of PPO=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval,
                                          self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval,
                                          self.tau)

        self.buffer.clear()
        assert self.buffer.is_empty()

        # update param
        ep_ratio = 1 - (i_episode / num_episode)
        if self.schedule_clip:
            self.ratio_clip = 0.2 * ep_ratio

        if self.schedule_adam:
            new_lr = self.lr * ep_ratio
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr

        print(
            f'critic_cnt {self._learn_critic_cnt}, actor_cnt {self._learn_actor_cnt}'
        )
        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration

        return loss_actor_avg, loss_critic_avg
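
# A minimal, self-contained sketch of the clipped-surrogate actor loss computed inside
# PPO.learn above (the entropy bonus is handled separately there). The standalone name
# and arguments are placeholders, not part of the original code.
import torch

def ppo_surrogate_loss_sketch(new_log_prob, old_log_prob, advantage, ratio_clip=0.2):
    ratio = torch.exp(new_log_prob - old_log_prob)                    # pi_new(a|s) / pi_old(a|s)
    clipped = torch.clamp(ratio, 1.0 - ratio_clip, 1.0 + ratio_clip)
    return -torch.min(ratio * advantage, clipped * advantage).mean()  # loss to minimize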
Ejemplo n.º 22
0
class A2CPolicy(BasePolicy):  #option: double
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 gae_lamda=1,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.gae_lamda = gae_lamda

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self.buffer = ReplayBuffer(buffer_size, replay=False)
        # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)
        self.critic_eval = critic_net.to(self.device)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())

            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.SmoothL1Loss()

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        if test:
            self.actor_eval.eval()
            return Categorical(self.actor_eval(state)).sample().item(), 0

        dist = self.actor_eval(state)
        m = Categorical(dist)
        action = m.sample()
        log_prob = m.log_prob(action)
        state_value = self.critic_eval(state)

        return action.item(), log_prob

    def learn(self):
        memory_split = self.buffer.split(
            self.buffer.all_memory())  # s, r, l, m
        S = torch.tensor(memory_split['s'],
                         dtype=torch.float32,
                         device=self.device)
        R = torch.tensor(memory_split['r'], dtype=torch.float32).view(-1, 1)
        M = torch.tensor(memory_split['m'], dtype=torch.float32).view(-1, 1)
        Log = torch.stack(memory_split['l']).view(-1, 1)

        v_eval = self.critic_eval(S)

        v_evals = v_eval.detach().cpu().numpy()
        rewards = R.numpy()
        masks = M.numpy()
        adv_gae_mc = self.GAE(rewards,
                              v_evals,
                              next_v_eval=0,
                              masks=masks,
                              gamma=self._gamma,
                              lam=self.gae_lamda)  # MC adv
        advantage = torch.from_numpy(adv_gae_mc).to(self.device).reshape(-1, 1)

        v_target = advantage + v_eval.detach()
        # critic_core
        critic_loss = self.criterion(v_eval, v_target)
        self.critic_eval_optim.zero_grad()
        critic_loss.backward()
        self.critic_eval_optim.step()
        self._learn_critic_cnt += 1

        if self._learn_critic_cnt % self.actor_learn_freq == 0:
            # actor_core
            actor_loss = (-Log * advantage).sum()
            self.actor_eval.train()
            self.actor_eval_optim.zero_grad()
            actor_loss.backward()
            self.actor_eval_optim.step()
            self._learn_actor_cnt += 1

        if self._target:
            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of AC=======')
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)
                self._sync_cnt += 1

        self.buffer.clear()
        assert self.buffer.is_empty()

    def process(self, **kwargs):
        self.buffer.append(**kwargs)
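
# `self.GAE` is used by several policies in this listing (PPO, A2CPolicy, A2C) but its
# implementation is not shown. A minimal sketch of the generalized advantage estimator it
# presumably computes; the signature mirrors the call sites above and is an assumption.
import numpy as np

def gae_sketch(rewards, v_evals, next_v_eval=0.0, masks=None, gamma=0.99, lam=1.0):
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
    values = np.asarray(v_evals, dtype=np.float32).reshape(-1)
    masks = np.ones_like(rewards) if masks is None else np.asarray(masks, dtype=np.float32).reshape(-1)
    adv = np.zeros_like(rewards)
    gae, next_v = 0.0, float(np.asarray(next_v_eval).reshape(-1)[-1])
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_v * masks[t] - values[t]  # one-step TD error
        gae = delta + gamma * lam * masks[t] * gae                  # lam=1 reduces to Monte-Carlo advantages
        adv[t] = gae
        next_v = values[t]
    return adv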
Ejemplo n.º 23
0
    def __init__(
        self,
        model,
        buffer_size=1e6,
        batch_size=256,
        policy_freq=2,
        tau=0.005,
        discount=0.99,
        policy_lr=3e-4,
        value_lr=3e-4,
        learn_iteration=1,
        verbose=False,
        act_dim=None,
        n_step=1,
        use_munchausen=False,
        use_priority=False,
        use_dist_q=False,
        use_PAL=False,
    ):
        super().__init__()
        self.tau = tau
        self.gamma = discount
        self.policy_freq = policy_freq
        self.learn_iteration = learn_iteration
        self.verbose = verbose
        self.act_dim = act_dim
        self.batch_size = batch_size

        self.use_dist_q = use_dist_q
        self.use_priority = use_priority
        self.use_munchausen = use_munchausen
        self.use_PAL = use_PAL

        assert not (self.use_priority and self.use_PAL)

        self.buffer = ReplayBuffer(buffer_size)
        if self.use_priority:
            self.buffer = PriorityReplayBuffer(buffer_size,
                                               gamma=discount,
                                               n_step=n_step)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(),
                                                 lr=policy_lr)
        self.critic_eval_optim = torch.optim.Adam(
            self.critic_eval.parameters(), lr=value_lr)

        self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim
        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=policy_lr)
        self.alpha = self.log_alpha.exp()

        self.eps = np.finfo(np.float32).eps.item()
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
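
# The constructor above sets up log_alpha, target_entropy and alpha_optim, but the learn
# step is not shown in this example. A minimal sketch of the temperature update that the
# other SAC variants in this listing perform with those fields; names are placeholders.
def alpha_update_sketch(log_alpha, alpha_optim, curr_log_prob, target_entropy):
    # Increase alpha when the policy entropy drops below target_entropy, decrease it otherwise.
    alpha_loss = -(log_alpha * (curr_log_prob + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return float(log_alpha.exp().detach().cpu().numpy())  # new alpha value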
Ejemplo n.º 24
0
class A2C(BasePolicy):  #option: double
    def __init__(
        self,
        model,
        buffer_size=1000,
        learning_rate=1e-3,
        discount_factor=0.99,
        gae_lamda=1,  # mc
        verbose=False,
        num_episodes=1000,
    ):
        super().__init__()
        self.lr = learning_rate
        self.end_lr = self.lr * 0.1
        self.eps = np.finfo(np.float32).eps.item()

        self._gamma = discount_factor
        self._gae_lamda = gae_lamda  # default: 1, MC
        self._learn_cnt = 0
        self._verbose = verbose
        self.schedule_adam = True
        self.buffer = ReplayBuffer(buffer_size, replay=False)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.criterion = nn.SmoothL1Loss()
        self.num_episodes = num_episodes

    def learn(self):
        pg_loss, v_loss = 0, 0
        mem = self.buffer.split(self.buffer.all_memory())  # s, r, l, m
        S = torch.tensor(mem['s'], dtype=torch.float32, device=device)
        R = torch.tensor(mem['r'], dtype=torch.float32).view(-1, 1)
        M = torch.tensor(mem['m'], dtype=torch.float32).view(-1, 1)
        # Log = torch.stack(list(mem['l'])).view(-1, 1)
        Log = torch.stack(mem['l']).view(-1, 1)

        v_eval = self.critic_eval(S)

        v_evals = v_eval.detach().cpu().numpy()
        rewards = R.numpy()
        masks = M.numpy()
        adv_gae_mc = self.GAE(rewards,
                              v_evals,
                              next_v_eval=0,
                              masks=masks,
                              gamma=self._gamma,
                              lam=self._gae_lamda)  # MC adv
        advantage = torch.from_numpy(adv_gae_mc).to(device).reshape(-1, 1)

        # critic_core
        v_target = advantage + v_eval.detach()
        critic_loss = self.criterion(v_eval, v_target)
        # actor_core
        actor_loss = (-Log * advantage).sum()

        self.critic_eval_optim.zero_grad()
        critic_loss.backward()
        self.critic_eval_optim.step()

        self.actor_eval_optim.zero_grad()
        actor_loss.backward()
        self.actor_eval_optim.step()

        v_loss += critic_loss.item()
        pg_loss += actor_loss.item()
        self._learn_cnt += 1

        self.buffer.clear()
        assert self.buffer.is_empty()

        if self.schedule_adam:
            new_lr = self.lr + (self.end_lr -
                                self.lr) / self.num_episodes * self._learn_cnt
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr
        return pg_loss, v_loss
Ejemplo n.º 25
0
class DDPG(BasePolicy):
    def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=1,
        target_update_freq=0,
        target_update_tau=1,
        learning_rate=1e-3,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()  # pi(s)
        self.critic_eval = model.value_net.to(device).train()  # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        if self._target:
            self.actor_target = self.copy_net(self.actor_eval)
            self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.MSELoss()  # why mse?

    def learn(self):
        loss_actor_avg, loss_critic_avg = 0, 0

        for _ in range(self._update_iteration):
            batch_split = self.buffer.split_batch(self._batch_size)
            S = torch.tensor(batch_split['s'],
                             dtype=torch.float32,
                             device=device)  # [batch_size, S.feature_size]
            A = torch.tensor(batch_split['a'],
                             dtype=torch.float32,
                             device=device).view(-1, 1)  # [batch_size, 1]
            M = torch.tensor(batch_split['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch_split['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch_split['s_'],
                              dtype=torch.float32,
                              device=device)

            with torch.no_grad():
                q_next = self.critic_eval(S_, self.actor_eval(S_))
                if self._target:
                    q_next = self.critic_target(S_, self.actor_target(S_))
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)
            # print (f'SIZE S {S.size()}, A {A.size()}')
            q_eval = self.critic_eval(S, A)  # [batch_size, q_value_size]
            critic_loss = self.criterion(q_eval, q_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()
                loss_actor_avg += actor_loss.item()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(f'=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print(f'=======Soft_sync_weight of DDPG=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval,
                                          self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval,
                                          self.tau)

        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration
        return loss_actor_avg, loss_critic_avg
Ejemplo n.º 26
0
class SACV(BasePolicy):
    def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        update_iteration=10,
        verbose=False,
        use_priority=False,
        act_dim=None,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size

        self.use_priority = use_priority
        self.use_dist = model.value_net.use_dist

        if self.use_priority:
            self.buffer = PriorityReplayBuffer(buffer_size)
        else:
            self.buffer = ReplayBuffer(buffer_size)  # off-policy

        if self.use_dist:
            assert model.value_net.num_atoms > 1
            # assert isinstance(model.value_net, CriticModelDist)
            self.v_min = model.value_net.v_min
            self.v_max = model.value_net.v_max
            self.num_atoms = model.value_net.num_atoms
            self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
            self.support = torch.linspace(self.v_min, self.v_max,
                                          self.num_atoms)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim
        self.act_dim = act_dim

        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()

    def _tensor(self, data, use_cuda=False):
        if np.array(data).ndim == 1:
            data = torch.tensor(data, dtype=torch.float32).view(-1, 1)
        else:
            data = torch.tensor(data, dtype=torch.float32)
        if use_cuda:
            data = data.to(device)
        return data

    def learn_dist(self, obs, act, rew, next_obs, mask):
        with torch.no_grad():
            next_act, next_log_pi = self.actor_target.evaluate(next_obs)
            # q(s, a) is replaced by a distribution z(s, a) over returns
            z1_next, z2_next = self.critic_target.get_probs(
                next_obs, next_act)  # [batch_size, num_atoms]
            p_next = torch.stack([
                torch.where(z1.sum() < z2.sum(), z1, z2)
                for z1, z2 in zip(z1_next, z2_next)
            ])
            p_next -= (self.alpha * next_log_pi)
            Tz = rew + mask * self._gamma * self.support.unsqueeze(0)  # distributional Bellman target on the support
            Tz = Tz.clamp(min=self.v_min, max=self.v_max)

            b = (Tz - self.v_min) / self.delta_z
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.num_atoms - 1)) * (l == u)] += 1

            m = obs.new_zeros(self._batch_size, self.num_atoms).cpu()
            p_next = p_next.cpu()
            # print (f'm device: {m.device}')
            # print (f'p_next device: {p_next.device}')
            offset = torch.linspace(0,
                                    ((self._batch_size - 1) * self.num_atoms),
                                    self._batch_size).unsqueeze(1).expand(
                                        self._batch_size, self.num_atoms).to(l)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (p_next *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (p_next *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        m = m.to(device)
        log_z1, log_z2 = self.critic_eval.get_probs(obs, act, log=True)
        loss1 = -(m * log_z1).sum(dim=1)
        loss2 = -(m * log_z2).sum(dim=1)
        batch_loss = 0.5 * (loss1 + loss2)
        return batch_loss

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            if self.use_priority:
                # s_{t}, n-step_rewards, s_{t+n}
                tree_idxs, S, A, R, S_, M, weights = self.buffer.sample(
                    self._batch_size)
                W = torch.tensor(weights, dtype=torch.float32,
                                 device=device).view(-1, 1)
            else:
                batch_split = self.buffer.split_batch(self._batch_size)
                S, A, M, R, S_ = batch_split['s'], batch_split[
                    'a'], batch_split['m'], batch_split['r'], batch_split['s_']
            # print ('after sampling from buffer!')
            if self.act_dim is None:
                self.act_dim = np.array(A).shape[-1]
                self.target_entropy = -torch.tensor(self.act_dim).to(device)

            R = torch.tensor(R, dtype=torch.float32).view(-1, 1)
            S = torch.tensor(S, dtype=torch.float32, device=device)
            # A = torch.tensor(A, dtype=torch.float32, device=device).view(-1, 1)
            A = torch.tensor(A, dtype=torch.float32,
                             device=device).view(-1, self.act_dim)
            # print (f'A shape {A.shape}')
            M = torch.tensor(M, dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(S_, dtype=torch.float32, device=device)

            # print (f'size S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}')
            if self.use_dist:
                # print (M[0].size())
                # print (M[0])
                # print (M[0].item())
                # assert 0
                # D = torch.from_numpy(np.array([1^int(mask.item()) for mask in M])).view(-1, 1)
                # print (f'size S:{S.shape}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}, D:{D.size()}')
                # assert 0
                batch_loss = self.learn_dist(S, A, R, S_, M)

            else:
                with torch.no_grad():
                    next_A, next_log = self.actor_target.evaluate(S_)
                    q1_next, q2_next = self.critic_target(S_, next_A)
                    q_next = torch.min(q1_next,
                                       q2_next) - self.alpha * next_log
                    q_target = R + M * self._gamma * q_next.cpu()
                    q_target = q_target.to(device)
                # q_loss
                q1_eval, q2_eval = self.critic_eval(S, A)
                loss1 = self.criterion(q1_eval, q_target)
                loss2 = self.criterion(q2_eval, q_target)
                # print(f'q_eval {q1_eval.shape}, q_target {q_target.shape}')
                batch_loss = 0.5 * (loss1 + loss2)

            if self.use_priority:
                critic_loss = (W * batch_loss).mean()
                self.buffer.update_priorities(
                    tree_idxs,
                    np.abs(batch_loss.detach().cpu().numpy()) + 1e-6)
            else:
                critic_loss = batch_loss.mean()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                if self.use_dist:
                    z1_next, z2_next = self.critic_eval.get_probs(S, curr_A)
                    p_next = torch.stack([
                        torch.where(z1.sum() < z2.sum(), z1, z2)
                        for z1, z2 in zip(z1_next, z2_next)
                    ])
                    num_atoms = torch.tensor(self.num_atoms,
                                             dtype=torch.float32,
                                             device=device)
                    # actor_loss = p_next * num_atoms
                    # actor_loss = torch.sum(actor_loss, dim=1)
                    # actor_loss = -(actor_loss + self.alpha * curr_log).mean()
                    actor_loss = (self.alpha * curr_log - p_next)
                    actor_loss = torch.sum(actor_loss, dim=1)
                    actor_loss = actor_loss.mean()
                else:
                    q1_next, q2_next = self.critic_eval(S, curr_A)
                    q_next = torch.min(q1_next, q2_next)
                    # pg_loss
                    actor_loss = (self.alpha * curr_log - q_next).mean()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(
                    self.log_alpha *
                    (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)

        return pg_loss, q_loss, a_loss
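
# A minimal, self-contained sketch of the categorical (C51-style) projection performed
# inside SACV.learn_dist above: the shifted support r + mask * gamma * z is clipped and
# redistributed onto the fixed atoms before the cross-entropy loss. Names are placeholders
# and all tensors are assumed to live on the same device.
import torch

def project_distribution_sketch(p_next, rewards, masks, support, gamma, v_min, v_max, delta_z):
    # p_next: [batch, num_atoms] target probabilities; rewards, masks: [batch, 1]; support: [num_atoms]
    batch_size, num_atoms = p_next.shape
    Tz = (rewards + masks * gamma * support.unsqueeze(0)).clamp(min=v_min, max=v_max)
    b = (Tz - v_min) / delta_z                                  # fractional atom index
    l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
    l[(u > 0) & (l == u)] -= 1                                  # keep l < u when b lands exactly on an atom
    u[(l < (num_atoms - 1)) & (l == u)] += 1
    m = torch.zeros(batch_size, num_atoms)
    offset = (torch.arange(batch_size) * num_atoms).unsqueeze(1).expand(batch_size, num_atoms)
    m.view(-1).index_add_(0, (l + offset).view(-1), (p_next * (u.float() - b)).view(-1))
    m.view(-1).index_add_(0, (u + offset).view(-1), (p_next * (b - l.float())).view(-1))
    return m                                                    # projected target distribution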
Ejemplo n.º 27
0
class DDPGPolicy(BasePolicy):
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 batch_size=100,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.replay_buffer = ReplayBuffer(buffer_size)
        # assert buffer.allow_replay, 'DDPG buffer must be replay buffer'

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)  # pi(s)
        self.critic_eval = critic_net.to(self.device)  # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())

            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.MSELoss()  # why mse?

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        if test:
            self.actor_eval.eval()
        action = self.actor_eval(state)  # out = tanh(x)
        action = action.clamp(-1, 1)

        return action.item()

    def learn(self):

        loss_actor_avg = 0
        loss_critic_avg = 0

        for _ in range(self._update_iteration):
            memory_batch = self.replay_buffer.random_sample(self._batch_size)
            batch_split = self.replay_buffer.split(memory_batch)

            S = torch.tensor(
                batch_split['s'], dtype=torch.float32,
                device=self.device)  # [batch_size, S.feature_size]
            A = torch.tensor(batch_split['a'],
                             dtype=torch.float32,
                             device=self.device).unsqueeze(
                                 -1)  # [batch_size, 1]
            S_ = torch.tensor(batch_split['s_'],
                              dtype=torch.float32,
                              device=self.device)
            R = torch.tensor(batch_split['r'],
                             dtype=torch.float32,
                             device=self.device).unsqueeze(-1)

            with torch.no_grad():
                q_target = self.critic_eval(S_, self.actor_eval(S_))
                if self._target:
                    q_target = self.critic_target(S_, self.actor_target(S_))
                q_target = R + self._gamma * q_target
            if self._verbose:
                print(
                    f'SIZE S {S.size()}, A {A.size()}, S_ {S_.size()}, R {R.size()}'
                )
            q_eval = self.critic_eval(S, A)  # [batch_size, q_value_size]
            critic_loss = self.criterion(q_eval, q_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()
                loss_actor_avg += actor_loss.item()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose: print(f'=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print(f'=======Soft_sync_weight of DDPG=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval,
                                          self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval,
                                          self.tau)

        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration

        return loss_actor_avg, loss_critic_avg

    def process(self, **kwargs):
        self.replay_buffer.append(**kwargs)
Ejemplo n.º 28
0
class SAC(BasePolicy):  # combine SAC1 and SAC2
    def __init__(
        self, 
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        act_dim=None,
        alpha=None # default: auto_entropy_tuning
        ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size) # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        
        self.criterion = nn.SmoothL1Loss()
        self.act_dim = act_dim
        self.alpha = alpha
        self.auto_entropy_tuning = True

        if self.alpha:
            self.auto_entropy_tuning = False
            self.value_eval = model.v_net.to(device).train()
            self.value_target = self.copy_net(self.value_eval)
            self.value_eval_optim = optim.Adam(self.value_eval.parameters(), lr=self.lr)
        else:
            self.target_entropy = -torch.tensor(1).to(device)
            self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
            self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
            self.alpha = self.log_alpha.exp()

    def learn(self):
        pg_loss, q_loss, loss = 0, 0, 0  # loss accumulates alpha loss or value loss, depending on the mode
        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
                if self.auto_entropy_tuning:
                    self.target_entropy = -torch.tensor(self.act_dim).to(device)

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)
            if self._verbose:
                print(f'shape S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}')
            
            if self.auto_entropy_tuning:
                with torch.no_grad():
                    next_A, next_log = self.actor_target.evaluate(S_)
                    q1_next, q2_next = self.critic_target(S_, next_A)
                    q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
            else:
                curr_A, curr_log = self.actor_eval.evaluate(S)

                # v_loss
                with torch.no_grad():
                    q1, q2 = self.critic_eval(S, curr_A)
                    v_target = torch.min(q1, q2) - self.alpha * curr_log
                    q_next = self.value_target(S_)

                v_eval = self.value_eval(S)
                value_loss = self.criterion(v_eval, v_target)
                # update V
                self.value_eval_optim.zero_grad()
                value_loss.backward()
                self.value_eval_optim.step()
                
            q_target = R + M * self._gamma * q_next.cpu()
            q_target = q_target.to(device)

            q1_eval, q2_eval = self.critic_eval(S, A)
            loss1 = self.criterion(q1_eval, q_target)
            loss2 = self.criterion(q2_eval, q_target)
            critic_loss = 0.5 * (loss1 + loss2)

            # update soft Q
            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                if self.auto_entropy_tuning:
                    curr_A, curr_log = self.actor_eval.evaluate(S)
                    q1_next, q2_next = self.critic_eval(S, curr_A)
                    q_eval_next = torch.min(q1_next, q2_next)

                    # alpha loss
                    alpha_loss = -(self.log_alpha * (curr_log + self.target_entropy).detach()).mean()
                    self.alpha_optim.zero_grad()
                    alpha_loss.backward()
                    self.alpha_optim.step()
                    self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())
                else:
                    q_eval_next = torch.min(q1, q2)
                # pg_loss
                actor_loss = (self.alpha * curr_log - q_eval_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self.auto_entropy_tuning:
                    self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)
                else:
                    self.soft_sync_weight(self.value_target, self.value_eval, self.tau)

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            if self.auto_entropy_tuning:
                loss += alpha_loss.item()
            else:
                loss += value_loss.item() 
        
        return pg_loss, q_loss, loss
Ejemplo n.º 29
0
class SAC2(BasePolicy):  # no value network
    def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=0.01,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()

        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.criterion = nn.SmoothL1Loss()

        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            batch_split = self.buffer.split_batch(self._batch_size)
            S = torch.tensor(batch_split['s'],
                             dtype=torch.float32,
                             device=device)
            A = torch.tensor(batch_split['a'],
                             dtype=torch.float32,
                             device=device).view(-1, 1)
            M = torch.tensor(batch_split['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch_split['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch_split['s_'],
                              dtype=torch.float32,
                              device=device)

            # print (f'size S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}, W:{W.size()}')
            with torch.no_grad():
                next_A, next_log = self.actor_target.evaluate(S_)
                q1_next, q2_next = self.critic_target(S_, next_A)
                q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            # q_loss
            q1_eval, q2_eval = self.critic_eval(S, A)
            critic_loss = self.criterion(q1_eval, q_target) + self.criterion(
                q2_eval, q_target)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                q1_next, q2_next = self.critic_eval(S, curr_A)
                q_next = torch.min(q1_next, q2_next)

                # pg_loss
                actor_loss = (self.alpha * curr_log - q_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(
                    self.log_alpha *
                    (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item() * 0.5
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval,
                                      self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval,
                                      self.tau)

        return pg_loss, q_loss, a_loss
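
# A minimal sketch of the entropy-regularized TD target that the SAC variants above build
# inside torch.no_grad(); argument names are placeholders, not part of the original code.
import torch

def soft_q_target_sketch(reward, mask, gamma, q1_next, q2_next, next_log_prob, alpha):
    # y = r + m * gamma * (min(Q1', Q2') - alpha * log pi(a'|s'))
    q_next = torch.min(q1_next, q2_next) - alpha * next_log_prob
    return reward + mask * gamma * q_next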
Ejemplo n.º 30
0
class PPOPolicy(BasePolicy):  #option: double
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.0001,
                 discount_factor=0.99,
                 batch_size=100,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.ratio_clip = 0.2
        self.lam_entropy = 0.01
        self.adv_norm = True
        self.rew_norm = False
        self.schedule_clip = False
        self.schedule_adam = False

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size, replay=False)
        # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)
        self.critic_eval = critic_net.to(self.device)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(),
                                           lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(),
                                            lr=self.lr)

        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())

            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.SmoothL1Loss()

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        if test:
            self.actor_eval.eval()
        with torch.no_grad():
            mu, sigma = self.actor_eval(state)
        dist = Normal(mu, sigma)
        action = dist.sample()
        # print (f'mu:{mu}, sigma:{sigma}, dist: {dist}, action sample before clamp: {action}')
        action = action.clamp(-2, 2)
        # print (f'action after clamp {action}')
        log_prob = dist.log_prob(action)
        assert abs(action.item()) <= 2, f'ERROR: action out of range: {action}'

        return action.item(), log_prob.item()

    def get_batchs_indices(self,
                           buffer_size,
                           batch_size,
                           replace=True,
                           batch_num=None):
        indices = [i for i in range(buffer_size)]
        if replace:  # sampling with replacement: batches may overlap across draws
            if not batch_num:
                batch_num = round(buffer_size / batch_size + 0.5) * 2
            return [
                np.random.choice(indices, batch_size, replace=False)
                for _ in range(batch_num)
            ]
        else:  # sampling without replacement: partition the shuffled buffer
            np.random.shuffle(indices)
            return [
                indices[i:i + batch_size]
                for i in range(0, buffer_size, batch_size)
            ]

    def learn(self, i_episode=0, num_episode=100):
        if not self.buffer.is_full():
            print(
                f'Waiting for a full buffer: {len(self.buffer)}/{self.buffer.capacity()} ',
                end='\r')
            return 0, 0

        loss_actor_avg = 0
        loss_critic_avg = 0

        memory_split = self.buffer.split(self.buffer.all_memory())
        S = torch.tensor(memory_split['s'],
                         dtype=torch.float32,
                         device=self.device)
        A = torch.tensor(memory_split['a'],
                         dtype=torch.float32,
                         device=self.device).view(-1, 1)
        S_ = torch.tensor(memory_split['s_'],
                          dtype=torch.float32,
                          device=self.device)
        R = torch.tensor(memory_split['r'], dtype=torch.float32).view(-1, 1)
        Log = torch.tensor(memory_split['l'],
                           dtype=torch.float32,
                           device=self.device).view(-1, 1)

        # print (f'Size S {S.size()}, A {A.size()}, S_ {S_.size()}, R {R.size()}, Log {Log.size()}')
        # print (f'S {S}, A {A}, S_ {S_}, R {R}, Log {Log}')
        with torch.no_grad():
            v_evals = self.critic_eval(S).cpu().numpy()
            end_v_eval = self.critic_eval(S_[-1]).cpu().numpy()

        rewards = self._normalized(
            R, self.eps).numpy() if self.rew_norm else R.numpy()
        # rewards = rewards.cpu().numpy()
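        # With lam=0, GAE collapses to the one-step TD error,
        #   A_t = r_t + gamma * V(s_{t+1}) - V(s_t),
        # computed from the current value estimates (end_v_eval bootstraps the last state).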
        adv_gae_td = self.GAE(rewards,
                              v_evals,
                              next_v_eval=end_v_eval,
                              gamma=self._gamma,
                              lam=0)  # td_error adv
        advantage = torch.from_numpy(adv_gae_td).to(self.device).unsqueeze(-1)
        advantage = self._normalized(advantage,
                                     1e-10) if self.adv_norm else advantage

        # indices = [i for i in range(len(self.buffer))]
        for _ in range(self._update_iteration):
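            # Critic target: V_target = advantage + V(s).detach(), held fixed for this update;
            # SmoothL1Loss then regresses V(s) toward it.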

            v_eval = self.critic_eval(S)
            v_target = advantage + v_eval.detach()

            critic_loss = self.criterion(v_eval, v_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                # actor_core
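                # PPO clipped surrogate: with ratio r = exp(log pi_new(a|s) - log pi_old(a|s)),
                # maximize E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)], eps = self.ratio_clip
                # (implemented below as a negated loss to be minimized).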
                mu, sigma = self.actor_eval(S)
                dist = Normal(mu, sigma)
                new_log_prob = dist.log_prob(A)

                pg_ratio = torch.exp(new_log_prob -
                                     Log)  # size = [batch_size, 1]
                clipped_pg_ratio = torch.clamp(pg_ratio, 1.0 - self.ratio_clip,
                                               1.0 + self.ratio_clip)

                surrogate_loss = -torch.min(
                    pg_ratio * advantage, clipped_pg_ratio * advantage).mean()

                # policy entropy
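                # Note: for a Gaussian policy, dist.entropy().mean() is the usual analytic
                # estimate; the sample-based expression below follows the original author's form.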
                loss_entropy = -torch.mean(
                    torch.exp(new_log_prob) * new_log_prob)

                actor_loss = surrogate_loss - self.lam_entropy * loss_entropy

                loss_actor_avg += actor_loss.item()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose: print('=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print('=======Soft_sync_weight of PPO=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval,
                                          self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval,
                                          self.tau)

        self.buffer.clear()
        assert self.buffer.is_empty()

        # update param
        ep_ratio = 1 - (i_episode / num_episode)
        if self.schedule_clip:
            self.ratio_clip = 0.2 * ep_ratio

        if self.schedule_adam:
            new_lr = self.lr * ep_ratio
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr

        print(
            f'critic_cnt {self._learn_critic_cnt}, actor_cnt {self._learn_actor_cnt}'
        )
        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration

        return loss_actor_avg, loss_critic_avg

    def process(self, **kwargs):
        self.buffer.append(**kwargs)
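
A minimal usage sketch, assuming the older gym reset/step API, hypothetical ActorNet/CriticNet classes matching the interfaces used above (actor(state) -> (mu, sigma), critic(state) -> V(s)), and that ReplayBuffer.append stores the keyword keys that learn() later reads back ('s', 'a', 'r', 's_', 'l'):

import gym

env = gym.make('Pendulum-v1')  # action range [-2, 2] matches the clamp in choose_action
policy = PPOPolicy(ActorNet(), CriticNet(), buffer_size=1000)  # hypothetical networks

num_episode = 1000
for i_episode in range(num_episode):
    state, done = env.reset(), False
    while not done:
        action, log_prob = policy.choose_action(state)
        next_state, reward, done, _ = env.step([action])
        # Keys mirror those read back in learn(): s, a, r, s_, l (assumed buffer layout)
        policy.process(s=state, a=action, r=reward, s_=next_state, l=log_prob)
        state = next_state
    policy.learn(i_episode, num_episode)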