Example #1
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency, for soft target update

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target.load_state_dict(self.act.state_dict())

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = CriticTwin(net_dim, state_dim,
                                     action_dim).to(self.device)
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}],
            lr=learning_rate)
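
The Actor network used by these agents is not shown in the examples. Below is a minimal sketch of the interface the snippets appear to assume: a tanh-bounded policy, plus a get_action(state, noise_std) method that adds clipped Gaussian noise, as called on the TD3 target actor in Example #5. Layer sizes and the noise clipping range are assumptions, not taken from the original code.

import torch
import torch.nn as nn


class Actor(nn.Module):  # hypothetical sketch; layer sizes are assumptions
    def __init__(self, net_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, net_dim), nn.ReLU(),
            nn.Linear(net_dim, net_dim), nn.ReLU(),
            nn.Linear(net_dim, action_dim),
        )

    def forward(self, state):
        return self.net(state).tanh()  # action bounded to [-1, 1]

    def get_action(self, state, noise_std):
        # deterministic action plus clipped Gaussian noise (target policy smoothing)
        action = self.net(state).tanh()
        noise = (torch.randn_like(action) * noise_std).clamp(-0.5, 0.5)
        return (action + noise).clamp(-1.0, 1.0)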
Example #2
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        AgentBase.__init__(self)
        self.explore_noise = 0.05

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target.load_state_dict(self.act.state_dict())

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = Critic(net_dim, state_dim,
                                 action_dim).to(self.device)
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}],
            lr=learning_rate)
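
The single-Q Critic used by the DDPG examples is also external. A minimal sketch, assuming it concatenates state and action and outputs one Q value; it follows the (net_dim, state_dim, action_dim) constructor of Examples #1, #2, #4 and #5 (Example #3 passes its arguments in a different order), and the layer sizes are assumptions.

import torch
import torch.nn as nn


class Critic(nn.Module):  # hypothetical sketch; layer sizes are assumptions
    def __init__(self, net_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, net_dim), nn.ReLU(),
            nn.Linear(net_dim, net_dim), nn.ReLU(),
            nn.Linear(net_dim, 1),
        )

    def forward(self, state, action):
        # Q(s, a): concatenate state and action, return a single value
        return self.net(torch.cat((state, action), dim=1))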
Example #3
	def __init__(self, cfg):
		self.critic_eval = Critic(cfg.n_state, cfg.n_action, cfg.mid_critic)
		self.critic_pred = Critic(cfg.n_state, cfg.n_action, cfg.mid_critic)  # target critic
		self.actor_eval = Actor(cfg.n_state, cfg.n_action, cfg.mid_actor)
		self.actor_pred = Actor(cfg.n_state, cfg.n_action, cfg.mid_actor)  # target actor
		hard_update(self.actor_pred, self.actor_eval)  # copy eval weights into the target networks
		hard_update(self.critic_pred, self.critic_eval)
		self.noise = OUANoise()  # Ornstein-Uhlenbeck exploration noise
		self.cfg = cfg
		self.epsilon = cfg.epsilon
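
Example #3 depends on hard_update and OUANoise, neither of which is defined in these snippets. The sketch below follows common conventions and is an assumption: hard_update(target, source) copies parameters verbatim, and the noise class implements an Ornstein-Uhlenbeck process. The real OUANoise constructor signature may differ.

import numpy as np


def hard_update(target, source):
    # copy every parameter of `source` into `target` (assumed argument order: target first)
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


class OUANoise:  # hypothetical stand-in for the OUANoise used in Example #3
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, action_dim=1, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.state = np.full(action_dim, mu, dtype=np.float32)

    def reset(self):
        self.state[:] = self.mu

    def sample(self):
        drift = self.theta * (self.mu - self.state) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.randn(*self.state.shape)
        self.state = self.state + drift + diffusion
        return self.state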
Example #4
class AgentDDPG(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target.load_state_dict(self.act.state_dict())

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = Critic(net_dim, state_dim,
                                 action_dim).to(self.device)
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states,
                                 dtype=torch.float32,
                                 device=self.device)
        actions = self.act(states)
        actions = actions + torch.randn_like(actions) * self.explore_noise
        actions = actions.clamp(-1, 1)
        return actions.detach().cpu().numpy()

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        critic_obj = actor_obj = None  # placeholders, returned for logging after the loop
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, state, action, next_state = buffer.random_sample(
                    batch_size)

                next_a = self.act_target(next_state)
                next_q = self.cri_target(next_state, next_a)
                q_label = reward + mask * next_q

            q_value = self.cri(state, action)
            critic_obj = self.criterion(q_value, q_label)

            action_pg = self.act(state)  # action used for the policy gradient
            actor_obj = -self.cri_target(state, action_pg).mean()

            united_obj = actor_obj + critic_obj  # objective
            self.optimizer.zero_grad()
            united_obj.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)

        self.obj_a = actor_obj.item()
        self.obj_c = critic_obj.item()
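
soft_target_update, called at the end of each training step in Examples #4 and #5, is also not defined here. A minimal sketch of the usual Polyak averaging update; the tau value is an assumption.

def soft_target_update(target, current, tau=5e-3):
    # Polyak averaging: target <- tau * current + (1 - tau) * target
    for t_param, c_param in zip(target.parameters(), current.parameters()):
        t_param.data.copy_(tau * c_param.data + (1.0 - tau) * t_param.data)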
Example #5
class AgentTD3(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency, for soft target update

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target.load_state_dict(self.act.state_dict())

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = CriticTwin(net_dim, state_dim,
                                     action_dim).to(self.device)
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}],
            lr=learning_rate)

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        critic_obj = actor_obj = None  # placeholders, returned for logging after the loop
        for i in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, state, action, next_s = buffer.random_sample(
                    batch_size)

                next_a = self.act_target.get_action(
                    next_s, self.policy_noise)  # policy noise
                next_q = torch.min(*self.cri_target.get__q1_q2(
                    next_s, next_a))  # twin critics
                q_label = reward + mask * next_q

            q1, q2 = self.cri.get__q1_q2(state, action)
            critic_obj = self.criterion(q1, q_label) + self.criterion(
                q2, q_label)  # twin critics

            action_pg = self.act(state)  # action used for the policy gradient
            actor_obj = -self.cri_target(state, action_pg).mean()

            united_obj = actor_obj + critic_obj  # objective
            self.optimizer.zero_grad()
            united_obj.backward()
            self.optimizer.step()

            if i % self.update_freq == 0:  # delay update
                soft_target_update(self.cri_target, self.cri)
                soft_target_update(self.act_target, self.act)

        self.obj_a = actor_obj.item()
        self.obj_c = critic_obj.item()
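
The TD3 examples call CriticTwin.get__q1_q2, which must return two independent Q estimates for the clipped double-Q target, and call CriticTwin(state, action) directly for the actor objective. A minimal sketch consistent with that usage; layer sizes and the choice to return the first head from forward are assumptions.

import torch
import torch.nn as nn


class CriticTwin(nn.Module):  # hypothetical sketch; layer sizes are assumptions
    def __init__(self, net_dim, state_dim, action_dim):
        super().__init__()

        def build_q():
            return nn.Sequential(
                nn.Linear(state_dim + action_dim, net_dim), nn.ReLU(),
                nn.Linear(net_dim, net_dim), nn.ReLU(),
                nn.Linear(net_dim, 1),
            )

        self.q1 = build_q()
        self.q2 = build_q()

    def forward(self, state, action):
        # single estimate, used here for the policy gradient
        return self.q1(torch.cat((state, action), dim=1))

    def get__q1_q2(self, state, action):
        # both estimates, used for the clipped double-Q target and the twin critic loss
        sa = torch.cat((state, action), dim=1)
        return self.q1(sa), self.q2(sa)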