Example #1
    def init(self, net_dim, state_dim, action_dim):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # actor-critic networks plus target copies used for stable Q-learning targets
        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = Critic(net_dim, state_dim, action_dim).to(self.device)

        # separate optimizers for actor and critic; self.learning_rate is set elsewhere (e.g. in __init__)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)
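Note that this variant constructs the target networks as fresh Actor/Critic instances instead of deep copies, so their initial weights differ from the online networks unless they are synchronized elsewhere (code not shown). A minimal, self-contained sketch of the usual hard sync, with nn.Linear stand-ins for the real networks:

import torch
import torch.nn as nn

# hypothetical stand-ins for Actor/Critic, only to make the sketch runnable
online = nn.Linear(4, 2)
target = nn.Linear(4, 2)   # freshly constructed, i.e. different random weights

# hard sync so the target starts from the same parameters as the online network;
# in Example #1 this would correspond to act_target <- act and cri_target <- cri
target.load_state_dict(online.state_dict())
assert all(torch.equal(p, q) for p, q in zip(online.parameters(), target.parameters()))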
Example #2
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05  # explore noise of action

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()  # Huber-style loss for the critic's TD error
        # a single Adam optimizer with one parameter group per network
        self.optimizer = torch.optim.Adam([{
            'params': self.act.parameters(),
            'lr': learning_rate
        }, {
            'params': self.cri.parameters(),
            'lr': learning_rate
        }])
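Examples #2 and #4 drive both networks with one Adam instance by passing a list of parameter groups, where each group can carry its own learning rate. A minimal, self-contained sketch of that pattern (the two nn.Linear modules are stand-ins for the Actor and Critic):

import torch
import torch.nn as nn

actor_like = nn.Linear(8, 2)    # stand-in for Actor
critic_like = nn.Linear(10, 1)  # stand-in for Critic

# one optimizer, two parameter groups; per-group options such as 'lr' override the defaults
optimizer = torch.optim.Adam([
    {'params': actor_like.parameters(), 'lr': 1e-4},
    {'params': critic_like.parameters(), 'lr': 1e-3},
])

loss = actor_like(torch.randn(3, 8)).sum() + critic_like(torch.randn(3, 10)).sum()
optimizer.zero_grad()
loss.backward()
optimizer.step()  # a single step updates both groups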
Example #3
class AgentDDPG(AgentBase):
    def __init__(self):
        super().__init__()
        self.explore_noise = 0.1  # explore noise of action

    def init(self, net_dim, state_dim, action_dim):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.MSELoss()
        self.act_optimizer = torch.optim.Adam(self.act.parameters(),
                                              lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(),
                                              lr=self.learning_rate)

    def select_action(self, state):
        states = torch.as_tensor((state, ),
                                 dtype=torch.float32,
                                 device=self.device).detach_()
        action = self.act(states)[0]
        action = action + torch.randn_like(action) * self.explore_noise  # Gaussian exploration noise
        return action.cpu().numpy()

    def update_net(self, buffer, target_step, batch_size, repeat_times):
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None
        for _ in range(int(target_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(
                    batch_size)
                next_q = self.cri_target(next_s, self.act_target(next_s))
                q_label = reward + mask * next_q
            q_value = self.cri(state, action)
            obj_critic = self.criterion(q_value, q_label)

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri)

            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()

            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            self.soft_update(self.act_target, self.act)
        return obj_actor.item(), obj_critic.item()
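The replay buffer itself is not shown, so how mask is built is an assumption; in DDPG implementations of this style it usually stores gamma * (1 - done), which makes q_label = reward + mask * next_q the standard TD target. A small numeric sketch of that convention:

import torch

gamma = 0.99                                   # discount factor (assumed value)
done = torch.tensor([[0.0], [1.0], [0.0]])     # termination flags for three transitions
reward = torch.tensor([[1.0], [0.5], [-0.2]])
next_q = torch.tensor([[10.0], [8.0], [3.0]])  # critic-target estimates for the next states

mask = gamma * (1.0 - done)                    # zero at terminal states, gamma otherwise
q_label = reward + mask * next_q               # TD target, as in update_net above
print(q_label)                                 # values: 10.9, 0.5, 2.77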
Example #4
class AgentDDPG(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05  # explore noise of action

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{
            'params': self.act.parameters(),
            'lr': learning_rate
        }, {
            'params': self.cri.parameters(),
            'lr': learning_rate
        }])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states,
                                 dtype=torch.float32,
                                 device=self.device)
        actions = self.act(states)
        actions = (actions +
                   torch.randn_like(actions) * self.explore_noise).clamp(
                       -1, 1)
        return actions.detach().cpu().numpy()

    def update_net(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()
        obj_critic = obj_actor = None  # just for print return
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(
                    batch_size)
                next_q = self.cri_target(next_s, self.act_target(next_s))
                q_label = reward + mask * next_q
            q_value = self.cri(state, action)
            obj_critic = self.criterion(q_value, q_label)

            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()

            obj_united = obj_actor + obj_critic  # objective
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)
        return obj_actor.item(), obj_critic.item()
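Both examples rely on a soft target update (self.soft_update in Example #3, soft_target_update in Example #4) that is defined outside these snippets. The conventional implementation is Polyak averaging, sketched below with an assumed tau and nn.Linear stand-ins:

import torch
import torch.nn as nn

def soft_target_update(target, online, tau=0.005):
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    # (tau is an assumed value; the snippets above do not show it)
    for t_param, o_param in zip(target.parameters(), online.parameters()):
        t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)

# usage with stand-in networks
online_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
target_net.load_state_dict(online_net.state_dict())  # hard sync once at the start
soft_target_update(target_net, online_net)           # then nudge after each update step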