Ejemplo n.º 1
0
class DDPG:
    """Deep Deterministic Policy Gradient agent.

    Relies on names defined elsewhere in this module: ``Actor``,
    ``Critic``, ``ReplayBuffer``, ``optim``, ``device``, ``buffer_size``,
    ``batch_size``, ``gamma`` and ``tau``.
    """

    def __init__(self, state_space, action_space):
        self.actor = Actor(state_space, action_space).to(device)
        self.critic = Critic(state_space, action_space).to(device)

        self.actor_target = Actor(state_space, action_space).to(device)
        self.critic_target = Critic(state_space, action_space).to(device)

        # Start the targets as exact copies of the online networks so the
        # first TD targets are consistent with the current policy/value.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        # BUG FIX: the original referenced the bare names ``actor`` and
        # ``critic`` (a NameError at construction time) instead of the
        # instance attributes created above.
        self.actor_optimiser = optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optimiser = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.mem = ReplayBuffer(buffer_size)

    def act(self, state, add_noise=False):
        """Delegate action selection (optionally with noise) to the actor."""
        return self.actor.act(state, add_noise)

    def save(self, fn):
        """Save actor/critic weights to ``<fn>_{actor,critic}_model.pth``."""
        torch.save(self.actor.state_dict(), "{}_actor_model.pth".format(fn))
        torch.save(self.critic.state_dict(), "{}_critic_model.pth".format(fn))

    def learn(self):
        """Sample a batch from replay memory and update all networks."""
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.mem.sample(
            batch_size)

        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(device)
        masks = torch.FloatTensor(masks).to(device)

        # Update critic, then actor, then the Polyak-averaged targets.
        self.update_critic(states=state_batch,
                           next_states=next_state_batch,
                           actions=action_batch,
                           rewards=reward_batch,
                           dones=masks)
        self.update_actor(states=state_batch)
        self.update_target_networks()

    def update_actor(self, states):
        """Ascend the critic's value of the actor's own actions."""
        actions_pred = self.actor(states)
        loss = -self.critic(states, actions_pred).mean()

        self.actor_optimiser.zero_grad()
        loss.backward()
        self.actor_optimiser.step()

    def update_critic(self, states, next_states, actions, rewards, dones):
        """Regress Q(s, a) onto the one-step TD target.

        NOTE(review): ``dones`` receives the replay buffer's ``masks`` and
        is used as ``(1 - dones)`` — assumes 1 marks a terminal transition;
        confirm against the ReplayBuffer implementation.
        """
        # Compute the target under no_grad so no gradients flow into the
        # target networks (they are never optimised anyway).
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            y_i = rewards + (gamma *
                             self.critic_target(next_states, next_actions) *
                             (1 - dones))
        expected_Q = self.critic(states, actions)

        loss = F.mse_loss(y_i, expected_Q)

        self.critic_optimiser.zero_grad()
        loss.backward()
        self.critic_optimiser.step()

    def update_target_networks(self):
        """Polyak-average: target <- tau * local + (1 - tau) * target."""
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)

        for target, local in zip(self.critic_target.parameters(),
                                 self.critic.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)
Ejemplo n.º 2
0
class PPO:
    """Proximal Policy Optimization agent (clipped surrogate loss + GAE).

    Networks and the on-policy memory are checkpointed under
    ``<path>/Net`` so training can resume across runs.  Relies on
    module-level names: ``Actor``, ``Critic``, ``MultivariateNormal``,
    ``torch``, ``F``, ``os`` and ``json``.
    """

    def __init__(self,
                 path,
                 s_dim=3,
                 a_dim=1,
                 hidden=64,
                 actor_lr=1e-4,
                 critic_lr=1e-4,
                 memory_len=64,
                 batch_size=32,
                 update_epoch=10,
                 gamma=0.9,
                 lambda_=0.95,
                 epsilon=0.2):
        """Build actor/critic networks and restore any saved checkpoint.

        Args:
            path: base directory; checkpoints live in ``<path>/Net``.
            s_dim / a_dim: state and action dimensionality.
            hidden: hidden-layer width for both networks.
            actor_lr / critic_lr: Adam learning rates.
            memory_len: number of transitions collected before an update.
            batch_size: minibatch size used during the update epochs.
            update_epoch: passes over the memory per update.
            gamma: reward discount factor.
            lambda_: GAE decay parameter.
            epsilon: PPO clipping range.
        """
        # Parameter initialization
        self.path = path
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_old = Actor(s_dim, a_dim, hidden)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.critic = Critic(s_dim, hidden)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)

        # memory initialization (parallel lists, one entry per transition)
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

        # Resume from a previous run if checkpoints exist under <path>/Net.
        if not os.listdir(self.path + '/Net'):
            # Nothing saved previously: start from scratch.
            print('init completed')
        else:
            # Restore the saved networks and the persisted memory.
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path +
                                                  '/Net/Actor.pth'))
            self.critic.load_state_dict(
                torch.load(self.path + '/Net/Critic.pth'))
            with open(self.path + '/Net/Memory_s.json', 'r') as f:
                self.memory_s = json.load(f)
            with open(self.path + '/Net/Memory_a.json', 'r') as f:
                self.memory_a = json.load(f)
            with open(self.path + '/Net/Memory_s_.json', 'r') as f:
                self.memory_s_ = json.load(f)
            with open(self.path + '/Net/Memory_r.json', 'r') as f:
                self.memory_r = json.load(f)
            with open(self.path + '/Net/Memory_done.json', 'r') as f:
                self.memory_done = json.load(f)
        # The old (behaviour) policy always starts in sync with the current one.
        self.actor_old.load_state_dict(self.actor.state_dict())

    def store_network(self):
        """Persist the networks and the on-policy memory to ``<path>/Net``."""
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        with open(self.path + '/Net/Memory_s.json', 'w') as f:
            json.dump(self.memory_s, f)
        with open(self.path + '/Net/Memory_a.json', 'w') as f:
            json.dump(self.memory_a, f)
        with open(self.path + '/Net/Memory_s_.json', 'w') as f:
            json.dump(self.memory_s_, f)
        with open(self.path + '/Net/Memory_r.json', 'w') as f:
            json.dump(self.memory_r, f)
        with open(self.path + '/Net/Memory_done.json', 'w') as f:
            json.dump(self.memory_done, f)

    def choose_action(self, s):
        """Sample an action from the current Gaussian policy, clipped to [-1, 1]."""
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float)
            mean, std = self.actor(s)
            # Diagonal covariance built from the actor's std output.
            cov = torch.diag_embed(std)
            dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
            a = dist.sample()
            a = torch.clamp(a, -1., 1.).numpy().tolist()
        return a

    def store_transition(self, s, a, s_, r, done):
        """Append one transition; trigger learning once the memory is full."""
        # store transition
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare the data as tensors
            s = torch.tensor(self.memory_s,
                             dtype=torch.float)  # [memory_len, s_dim]
            a = torch.tensor(self.memory_a,
                             dtype=torch.float)  # [memory_len, 1(a_dim)]
            r = torch.tensor(self.memory_r, dtype=torch.float)  # [memory_len]
            s_ = torch.tensor(self.memory_s_,
                              dtype=torch.float)  # [memory_len, s_dim]
            done = torch.tensor(self.memory_done,
                                dtype=torch.float)  # [memory_len]
            self._learn(s, a, s_, r, done)

    def _learn(self, s, a, s_, r, done):
        """Run ``update_epoch`` passes of minibatch PPO updates, then clear memory."""
        gae = self._gae(s, r, s_, done)  # [memory_len, 1]
        r = self._discounted_r(r, s_, done)  # [memory_len, 1]

        # calculate old log probability (behaviour policy frozen for the ratio)
        self.actor_old.load_state_dict(self.actor.state_dict())
        old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]

        # batch update the network
        # NOTE(review): minibatches are taken in fixed order, not shuffled.
        for i in range(self.update_epoch):
            for index in range(0, self.memory_len, self.batch_size):
                self.update_actor(s[index:index + self.batch_size],
                                  a[index:index + self.batch_size],
                                  gae[index:index + self.batch_size],
                                  old_log_prob[index:index + self.batch_size])
                self.update_critic(s[index:index + self.batch_size],
                                   r[index:index + self.batch_size])
        # empty the memory
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        """Return log pi(a|s) under the current (or old) policy, shape [batch, 1]."""
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        # Broadcast the shared std across the batch.
        # NOTE(review): assumes the actor returns a batch-independent std — confirm.
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        """Generalized Advantage Estimation over the stored trajectory.

        Computes delta_t = r_t + gamma * V(s'_t) - V(s_t) and accumulates
        backwards with decay gamma * lambda_, resetting at done steps.
        Returns a [memory_len, 1] tensor.
        """
        with torch.no_grad():
            v = self.critic(s).squeeze()  # [memory_len]
            v_ = self.critic(s_).squeeze()  # [memory_len]
            delta = r + self.gamma * v_ - v

            length = r.shape[0]
            gae = torch.zeros(size=[length])
            running_add = 0
            for t in range(length - 1, -1, -1):
                gae[t] = running_add * self.gamma * self.lambda_ * (
                    1 - done[t]) + delta[t]
                running_add = gae[t]
            return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        """Discounted return target for the critic, shape [memory_len, 1].

        NOTE(review): at done steps and at the last step the target
        bootstraps from V(s') as ``v_[t] * gamma + r[t]`` — bootstrapping
        on a terminal step is unusual; verify this is intended.
        """
        with torch.no_grad():
            length = len(r)
            discounted_r = torch.zeros(size=[length])
            v_ = self.critic(s_)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
        return discounted_r.unsqueeze(dim=-1)

    def update_actor(self, s, a, gae, old_log_prob):
        """One clipped-surrogate policy-gradient step on a minibatch."""
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon,
                            1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        loss = loss - 0.001 * self.actor.log_std  # std regulariser; found to help PPO on this task
        # update
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        """One MSE value-regression step on a minibatch."""
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        # update
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
Ejemplo n.º 3
0
class DDPG:
    """Deep Deterministic Policy Gradient agent.

    Uses soft (Polyak) target updates and Gaussian exploration noise
    whose variance decays multiplicatively after every learning step.
    Relies on module-level ``Actor``, ``Critic`` and ``ReplayBuffer``.
    """

    def __init__(self, s_dim, a_dim, device, hidden, capacity, batch_size,
                 lr_actor, lr_critic, variance_start, variance_decay,
                 variance_min, gamma, tau):
        # Hyper-parameters.
        self.s_dim, self.a_dim, self.hidden = s_dim, a_dim, hidden
        self.device = device
        self.lr_actor, self.lr_critic = lr_actor, lr_critic
        self.capacity, self.batch_size = capacity, batch_size
        self.var = variance_start
        self.var_decay, self.var_min = variance_decay, variance_min
        self.gamma, self.tau = gamma, tau

        # Online and target networks; each target starts as an exact copy.
        self.actor = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.opt_critic = torch.optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # Experience replay.
        self.memory = ReplayBuffer(capacity, batch_size, device)

    def get_action(self, s):
        """Return a noise-perturbed action for state ``s``, clipped to [-1, 1]."""
        with torch.no_grad():
            state = torch.FloatTensor(s).to(self.device)
            action = self.actor(state).numpy()
        noisy = np.random.normal(action, self.var)
        return np.clip(noisy, -1., 1.)

    def learn(self):
        """One optimisation step: critic, actor, target nets, noise decay."""
        states, actions, next_states, rewards, dones = self.memory.get_sample()

        # Critic: regress Q(s, a) onto the frozen one-step TD target.
        with torch.no_grad():
            next_q = self.critic_target(next_states,
                                        self.actor_target(next_states))
            target = rewards + (1 - dones) * self.gamma * next_q
        critic_loss = F.mse_loss(self.critic(states, actions), target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # Actor: maximise the critic's value of the actor's own actions.
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # Track the online networks with Polyak averaging.
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)

        # Decay the exploration noise, bounded below by var_min.
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        """Polyak-average: target <- (1 - tau) * target + tau * source."""
        for t_param, s_param in zip(target.parameters(),
                                    source.parameters()):
            blended = (1.0 - self.tau) * t_param.data + self.tau * s_param.data
            t_param.data.copy_(blended)
Ejemplo n.º 4
0
class PPO:
    """Proximal Policy Optimization agent (clipped surrogate loss + GAE).

    Single-dimension continuous actions scaled by ``bound``.  Relies on
    module-level names: ``Actor``, ``Critic``, ``MultivariateNormal``,
    ``torch`` and ``F``.
    """

    def __init__(self, s_dim, a_dim, bound, hidden, device, lr, memory_len,
                 batch_size, update_epoch, gamma, lambda_, epsilon):
        """Build the networks and the on-policy memory.

        Args:
            s_dim / a_dim: state and action dimensionality.
            bound: action magnitude bound; actions are scaled by it.
            hidden: hidden-layer width.
            device: requested torch device (falls back to CPU if no CUDA).
            lr: shared Adam learning rate for actor and critic.
            memory_len: transitions collected before an update.
            batch_size: minibatch size for the update epochs.
            update_epoch: passes over the memory per update.
            gamma: reward discount factor.
            lambda_: GAE decay parameter.
            epsilon: PPO clipping range.
        """
        # Parameter initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.bound = bound
        self.hidden = hidden
        self.device = torch.device(
            device if torch.cuda.is_available() else 'cpu')
        self.lr = lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_old = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
        self.critic = Critic(s_dim).to(self.device)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=self.lr)

        # memory initialization (parallel lists, one entry per transition)
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def get_action(self, s):
        """Sample an action from the Gaussian policy, scaled and clipped to ±bound."""
        s = torch.tensor(s, dtype=torch.float, device=self.device)
        mean, std = self.actor(s)
        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        a = dist.sample()
        a = torch.clamp(a * self.bound, -self.bound, self.bound)
        # Because in this environment, action_dim equals 1, we use .item().
        # When action_dim > 1, please use .numpy()
        return a.item()

    def learn(self, s, a, s_, r, done):
        """Store one transition; run the PPO update once the memory is full."""
        # store transition (actions are normalised back into [-1, 1])
        self.memory_s.append(s)
        self.memory_a.append(a / self.bound)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare the data as tensors
            s = torch.tensor(self.memory_s,
                             dtype=torch.float,
                             device=self.device)  # [memory_len, s_dim]
            a = torch.tensor(self.memory_a,
                             dtype=torch.float,
                             device=self.device).unsqueeze(
                                 dim=-1)  # [memory_len, 1(a_dim)]
            r = torch.tensor(self.memory_r,
                             dtype=torch.float,
                             device=self.device)  # [memory_len]
            s_ = torch.tensor(self.memory_s_,
                              dtype=torch.float,
                              device=self.device)  # [memory_len, s_dim]
            gae = self._gae(s, r, s_, self.memory_done)
            r = self._discounted_r(r, s_, self.memory_done)

            # calculate old log probability (behaviour policy frozen for the ratio)
            self.actor_old.load_state_dict(self.actor.state_dict())
            old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]

            # batch update the network
            # NOTE(review): minibatches are taken in fixed order, not shuffled.
            for i in range(self.update_epoch):
                for index in range(0, self.memory_len, self.batch_size):
                    self.update_actor(
                        s[index:index + self.batch_size],
                        a[index:index + self.batch_size],
                        gae[index:index + self.batch_size],
                        old_log_prob[index:index + self.batch_size])
                    self.update_critic(s[index:index + self.batch_size],
                                       r[index:index + self.batch_size])
            # empty the memory
            self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        """Return log pi(a|s) under the current (or old) policy, shape [batch, 1]."""
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        # Broadcast the shared std across the batch.
        # NOTE(review): assumes the actor returns a batch-independent std — confirm.
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        """Generalized Advantage Estimation over the stored trajectory.

        Computes delta_t = r_t + gamma * V(s'_t) - V(s_t) and accumulates
        backwards with decay gamma * lambda_, resetting at done steps.
        NOTE(review): ``gae`` is allocated without ``device=`` and so lives
        on the CPU; on a CUDA device this may mismatch — confirm.
        """
        with torch.no_grad():
            v = self.critic(s).squeeze()  # [memory_len]
            v_ = self.critic(s_).squeeze()  # [memory_len]
            delta = r + self.gamma * v_ - v

            length = r.shape[0]
            gae = torch.zeros(size=[length])
            running_add = 0
            for t in range(length - 1, -1, -1):
                gae[t] = running_add * self.gamma * self.lambda_ * (
                    1 - done[t]) + delta[t]
                running_add = gae[t]
            return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        """Discounted return target for the critic, shape [memory_len, 1].

        NOTE(review): at done steps and at the last step the target
        bootstraps from V(s') as ``v_[t] * gamma + r[t]`` — bootstrapping
        on a terminal step is unusual; verify this is intended.
        """
        with torch.no_grad():
            length = len(r)
            discounted_r = torch.zeros(size=[length])
            v_ = self.critic(s_)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
        return discounted_r.unsqueeze(dim=-1)

    def _entropy(self, s, a):
        """Return the policy entropy at states ``s`` (currently unused)."""
        mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        entropy = dist.entropy()
        return entropy

    def update_actor(self, s, a, gae, old_log_prob):
        """One clipped-surrogate policy-gradient step on a minibatch."""
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon,
                            1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        # loss = loss - 0.001 * self.actor.entropy() # the entropy bonus did not help on this task, so it stays disabled
        # update
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        """One MSE value-regression step on a minibatch."""
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        # update
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
Ejemplo n.º 5
0
class DDPG:
    """Deep Deterministic Policy Gradient agent with checkpoint/restore.

    Persists networks, replay memory and the exploration variance under
    ``<path>/Net`` so training can resume across runs.  Relies on module
    names ``Actor``, ``Critic``, ``Memory``, ``_soft_update``, ``torch``,
    ``F``, ``np``, ``os`` and ``json``.
    """

    def __init__(self,
                 path,
                 s_dim = 3,           # state-space dimensionality
                 a_dim = 1,            # action-space dimensionality
                 hidden = 64,          # hidden-layer width
                 device = 'gpu',       # training device ('gpu' is mapped to 'cuda')
                 capacity = 2e3,       # replay-memory size
                 batch_size= 256,      # training batch size
                 start_lr_step = 512,  # transitions stored before learning starts
                 gamma=0.9,            # reward discount factor
                 var_init = 1.,        # initial exploration variance
                 var_decay = 0.9999,   # multiplicative variance decay
                 var_min = 0.1,        # variance floor
                 actor_lr = 1e-3,      # actor learning rate
                 critic_lr = 3e-4,     # critic learning rate
                 actor_tau = 0.1,      # actor target soft-update rate
                 critic_tau = 0.2,     # critic target soft-update rate
    ):
        # Hyper-parameters.
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        # BUG FIX: 'gpu' is not a valid torch device string — torch.device('gpu')
        # raises on CUDA machines.  Map the legacy default to 'cuda' and keep
        # the CPU fallback when CUDA is unavailable.  Note the networks below
        # are never moved to this device; it is only handed to Memory.
        if device == 'gpu':
            device = 'cuda'
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        # NOTE(review): capacity defaults to the float 2e3 and is passed
        # through to Memory unchanged — confirm Memory accepts a float.
        self.capacity = capacity
        self.batch_size = batch_size
        self.start_lr_step = start_lr_step
        self.gamma = gamma
        self.var = var_init
        self.var_decay = var_decay
        self.var_min = var_min
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_tau = actor_tau
        self.critic_tau = critic_tau
        self.path = path
        self.counter = 0  # not used yet

        # Networks and optimisers.
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_target = Actor(s_dim, a_dim, hidden)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = Critic(s_dim, a_dim, hidden)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # Replay memory.
        self.memory = Memory(capacity, batch_size, self.device)

        # Resume from a previous run if checkpoints exist under <path>/Net.
        if not os.listdir(self.path + '/Net'):
            # Nothing saved previously: sync targets with the online nets.
            print('init completed')
            self.actor_target.load_state_dict(self.actor.state_dict())
            self.critic_target.load_state_dict(self.critic.state_dict())
        else:
            # Restore the saved networks, memory and exploration variance.
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path + '/Net/Actor.pth'))
            self.actor_target.load_state_dict(torch.load(self.path + '/Net/Actor_Target.pth'))
            self.critic.load_state_dict(torch.load(self.path + '/Net/Critic.pth'))
            self.critic_target.load_state_dict(torch.load(self.path + '/Net/Critic_Target.pth'))
            with open(self.path + '/Net/Memory.json', 'r') as f:
                self.memory.memory = json.load(f)
            with open(self.path + '/Net/Counter.json', 'r') as f:
                self.memory.counter = json.load(f)
            with open(self.path + '/Net/Var.json', 'r') as f:
                self.var = json.load(f)

    def choose_action(self, s):
        """Return a noise-perturbed action for state ``s``, clipped to [-1, 1]."""
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float)
            a = self.actor(s).numpy()
        a = np.clip(np.random.normal(loc=a, scale=self.var), -1., 1.)
        return a

    def store_transition(self, s, a, s_, r, done):
        """Store one transition; learn once enough experience is collected."""
        self.memory.store_transition(s, a, s_, r, done)
        if self.memory.counter >= self.start_lr_step:
            s, a, s_, r, done = self.memory.get_sample()
            self._learn(s, a, s_, r, done)

    def store_network(self):
        """Persist networks, replay memory and variance to ``<path>/Net``."""
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.actor_target.state_dict(), self.path + '/Net/Actor_Target.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        torch.save(self.critic_target.state_dict(), self.path + '/Net/Critic_Target.pth')
        with open(self.path + '/Net/Memory.json', 'w') as f:
            json.dump(self.memory.memory, f)
        with open(self.path + '/Net/Counter.json', 'w') as f:
            json.dump(self.memory.counter, f)
        with open(self.path + '/Net/Var.json', 'w') as f:
            json.dump(self.var, f)

        print(self.var, self.memory.counter)

    def _learn(self, s, a, s_, r, done):
        """One optimisation step: critic, actor, target nets, noise decay."""
        # Update the critic.  FIX: compute the TD target under no_grad so
        # no gradients flow into the (never-optimised) target networks.
        with torch.no_grad():
            td_target = r + (1-done) * self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # Update the actor: maximise the critic's value of its own actions.
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # Soft-update the target networks (module-level helper).
        _soft_update(self.critic_target, self.critic, self.critic_tau)
        _soft_update(self.actor_target, self.actor, self.actor_tau)

        # Decay the exploration noise, bounded below by var_min.
        self.var = max(self.var * self.var_decay, self.var_min)