예제 #1
0
class ActorCritic:
    """One-step actor-critic agent for a discrete action space.

    Each call to ``learn`` consumes a single transition: the critic is
    regressed onto a one-step TD target and the actor is then updated with
    the resulting TD error as the weight of the log-probability of the
    action taken. No transitions are stored.
    """

    def __init__(self, s_dim, a_num, device, hidden, lr_actor, lr_critic,
                 gamma):
        """Store the hyper-parameters and build the networks and optimizers.

        ``Actor`` is constructed as ``Actor(s_dim, hidden, a_num)`` and
        ``Critic`` as ``Critic(s_dim, hidden)``; both are moved to ``device``
        and each gets its own Adam optimizer.
        """
        # hyper-parameters
        self.s_dim, self.a_num = s_dim, a_num
        self.device = device
        self.hidden = hidden
        self.lr_actor, self.lr_critic = lr_actor, lr_critic
        self.gamma = gamma

        # networks and their optimizers (this algorithm keeps no memory)
        self.actor = Actor(s_dim, hidden, a_num).to(device)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(),
                                          lr=lr_actor)
        self.critic = Critic(s_dim, hidden).to(device)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

    def get_action(self, s):
        """Sample an action index from the actor's output for state ``s``.

        The actor output is passed to ``Categorical`` as its probabilities.
        Returns the sampled index as a Python number.
        """
        state = torch.FloatTensor(s).to(self.device)
        policy = Categorical(self.actor(state))
        return policy.sample().detach().item()

    def learn(self, s, a, s_, r, done):
        """Run one critic step and one actor step from a single transition.

        Args:
            s: current state.
            a: action index that was taken.
            s_: next state.
            r: scalar reward.
            done: truthy when the transition ended the episode; the
                bootstrap term is then dropped from the TD target.
        """
        done = int(bool(done))
        # torch.tensor() accepts scalars as well as sequences, unlike the
        # legacy FloatTensor/LongTensor constructors.
        on_device = dict(device=self.device)
        state = torch.tensor(s, dtype=torch.float, **on_device)
        action = torch.tensor(a, dtype=torch.long, **on_device)
        next_state = torch.tensor(s_, dtype=torch.float, **on_device)
        reward = torch.tensor(r, dtype=torch.float, **on_device)

        # ---- critic ----
        value = self.critic(state)
        with torch.no_grad():
            # target and TD error are constants for both updates
            next_value = self.critic(next_state)
            td_target = reward + (1-done)*self.gamma*next_value.detach()
            td_error = td_target - value
        critic_loss = F.mse_loss(value, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # ---- actor ----
        policy = Categorical(self.actor(state))
        actor_loss = -td_error * policy.log_prob(action)
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()
예제 #2
0
class TD3:
    """Twin Delayed DDPG agent that samples from a prioritized replay buffer.

    The critic is trained on every ``learn`` call; the actor, the target
    networks and the exploration variance are only updated every
    ``policy_freq`` calls. Buffer priorities are rewritten from the squared
    TD error, optionally plus the squared Q-value of the actor's action
    when ``p_with_pi`` is set.
    """

    def __init__(self, s_dim, a_dim, capacity, batch_size, lr_actor, lr_critic,
                 alpha, beta, p_with_pi, hidden, reg_coe, var_init, var_decay,
                 var_min, gamma, tau, policy_noise, noise_clip, policy_freq):
        """Store the hyper-parameters and build networks, optimizers, memory.

        ``reg_coe`` is passed to both Adam optimizers as ``weight_decay``;
        ``alpha`` and ``beta`` are forwarded to the ``PER`` buffer.
        """
        # dimensions / optimizer settings
        self.s_dim, self.a_dim = s_dim, a_dim
        self.lr_actor, self.lr_critic = lr_actor, lr_critic
        self.hidden = hidden
        self.reg_coe = reg_coe
        # prioritized replay settings
        self.alpha, self.beta = alpha, beta
        self.p_with_pi = p_with_pi
        self.capacity, self.batch_size = capacity, batch_size
        # exploration noise (value passed to np.random.normal as its scale)
        self.var = var_init
        self.var_decay, self.var_min = var_decay, var_min
        # TD3 settings
        self.gamma, self.tau = gamma, tau
        self.policy_noise, self.noise_clip = policy_noise, noise_clip
        self.policy_freq = policy_freq
        self.train_it = 0  # number of learn() calls so far

        # actor and its target copy
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_target = copy.deepcopy(self.actor)
        self.opt_actor = torch.optim.Adam(
            self.actor.parameters(), lr=lr_actor, weight_decay=reg_coe)

        # twin critic and its target copy
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = copy.deepcopy(self.critic)
        self.opt_critic = torch.optim.Adam(
            self.critic.parameters(), lr=lr_critic, weight_decay=reg_coe)

        # replay buffer
        self.memory = PER(capacity, batch_size, alpha, beta)

    def get_action(self, s):
        """Return the actor's action for ``s`` perturbed by Gaussian noise.

        The noisy action is clipped element-wise to ``[-1, 1]`` and returned
        as a NumPy array.
        """
        with torch.no_grad():
            mean_action = self.actor(torch.FloatTensor(s))
        mean_action = mean_action.numpy()
        # exploration: draw around the deterministic action
        noisy_action = np.random.normal(mean_action, self.var)
        return np.clip(noisy_action, -1., 1.)

    def learn(self):
        """Run one TD3 training step on a batch drawn from the buffer."""
        self.train_it += 1
        # NOTE(review): `weight` is sampled but never applied to the critic
        # loss -- confirm whether importance-sampling correction is intended.
        s, a, s_, r, done, weight, samples_index = self.memory.get_sample()

        with torch.no_grad():
            # target policy smoothing: clipped noise on the target action
            smoothing = (torch.randn_like(a) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(s_) + smoothing).clamp(-1., 1.)
            # clipped double-Q target
            next_q1, next_q2 = self.critic_target(s_, next_action)
            next_q = torch.min(next_q1, next_q2)
            td_target = r + (1 - done) * self.gamma * next_q

        # ---- critic: summed squared error of both heads ----
        q1, q2 = self.critic(s, a)
        td_error = (q1 - td_target)**2 + (q2 - td_target)**2
        critic_loss = td_error.mean()
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        priority_floor = np.e ** -10  # small constant added to every priority
        if not self.p_with_pi:
            td_priority = td_error.squeeze().abs().detach().numpy()
            self.memory.priority[samples_index] = td_priority + priority_floor

        # ---- delayed updates ----
        if self.train_it % self.policy_freq != 0:
            return

        q = self.critic.Q1(s, self.actor(s))
        actor_loss = -q.mean()
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        if self.p_with_pi:
            # priority additionally includes the squared Q of the policy action
            td_priority = td_error.squeeze().abs().detach().numpy()
            pi_priority = q.squeeze().pow(2).detach().numpy()
            self.memory.priority[samples_index] = (
                td_priority + pi_priority + priority_floor)

        # move the targets toward the online networks
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)

        # decay the exploration variance, bounded below by var_min
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        """Blend ``source`` parameters into ``target`` in place.

        Each target parameter becomes ``(1 - tau) * target + tau * source``.
        """
        pairs = zip(target.parameters(), source.parameters())
        for tgt, src in pairs:
            blended = tgt.data * (1.0 - self.tau) + src.data * self.tau
            tgt.data.copy_(blended)
예제 #3
0
class Agent():
    """DDPG agent that interacts with and learns from the environment.

    Relies on module-level names defined elsewhere in the file: ``device``,
    ``LR_ACTOR``, ``LR_CRITIC``, ``WEIGHT_DECAY``, ``BUFFER_SIZE``,
    ``BATCH_SIZE``, ``GAMMA`` and ``TAU``.
    """
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and keep
        # the seed value itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, state, action, reward, next_state):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (np.ndarray): current state
            add_noise (bool): add a sample from the noise process when True

        The result is clipped element-wise to [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        # evaluate in eval mode, then restore training mode
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s') batches;
                no done flag is stored, so the target always bootstraps
            gamma (float): discount factor
        """
        states, actions, rewards, next_states = experiences

        # ---------------------------- update critic ---------------------------- #
        # The bootstrap target is a constant for the critic regression, so it
        # is computed without autograd; otherwise gradients would be traced
        # through (and accumulated in) the target networks on every step.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next)
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximize the critic's value of the actor's action
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
예제 #4
0
class ActorCritic:
    """Actor-critic agent with GAE advantages for a discrete action space.

    Transitions are buffered by ``store_transition``; once more than
    ``memory_len`` of them are held, one critic step and one actor step are
    taken on the whole buffer and the buffer is cleared.
    """

    def __init__(
            self,
            s_dim,
            a_num,
            device,
            hidden,
            lr_actor,
            lr_critic,
            memory_len,
            gamma,
            lambda_,
    ):
        """Store the hyper-parameters and build networks, optimizers, buffers.

        ``Actor`` is constructed as ``Actor(s_dim, hidden, a_num)`` and
        ``Critic`` as ``Critic(s_dim, hidden)``; both are moved to ``device``.
        """
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.memory_len = memory_len
        self.gamma = gamma
        self.lambda_ = lambda_

        # network initialization
        self.actor = Actor(s_dim, hidden, a_num).to(self.device)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, hidden).to(self.device)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)

        # rollout buffers, emptied after every update
        self.memory_s = []
        self.memory_a = []
        self.memory_s_ = []
        self.memory_r = []
        self.memory_done = []

    def get_action(self, s):
        """Sample an action index from the actor's output for state ``s``."""
        s = torch.FloatTensor(s).to(self.device)
        # acting needs no gradients; skip building the autograd graph
        with torch.no_grad():
            prob_weights = self.actor(s)
        # select action w.r.t the actions prob
        dist = Categorical(prob_weights)
        action = (dist.sample()).detach().item()
        return action

    def store_transition(self, s, a, s_, r, done):
        """Append one transition and learn once the buffer is over capacity.

        ``done`` is stored as 1 or 0. Learning is triggered when the number
        of stored transitions exceeds ``memory_len``.
        """
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r)>self.memory_len:
            self._learn()

    def _GAE(self, s, r, s_, done):
        """Return the generalized advantage estimate for every stored step.

        Computed backwards over the buffer without gradients; the running
        sum is reset wherever ``done[t]`` is 1.
        """
        with torch.no_grad():
            v = self.critic(s).squeeze()
            v_ = self.critic(s_).squeeze()
            # one-step TD residuals; no bootstrap across a terminal step
            delta = r + self.gamma*v_*(1-done) - v

            length = r.shape[0]
            GAE = torch.zeros(size=[length], device=self.device)
            running_add = 0
            for t in range(length - 1, -1, -1):
                running_add = delta[t] + running_add * \
                              self.gamma * self.lambda_ * (1 - done[t])
                GAE[t] = running_add
            return GAE

    def _discounted_r(self, r, done, last_value=0.0):
        """Return the discounted return for every stored step.

        Args:
            r: 1-D tensor of rewards.
            done: 1-D tensor of 0/1 episode-end flags.
            last_value: value used to continue the return past the last
                stored step. It is multiplied by ``(1 - done[-1])``, so it
                only contributes when the buffer ends mid-episode. The
                default of 0.0 gives the plain truncated return.
        """
        length = r.shape[0]
        discounted_r = torch.zeros([length], device=self.device)
        running_add = last_value
        for t in range(length - 1, -1, -1):
            running_add = running_add * self.gamma * (1 - done[t]) + r[t]
            discounted_r[t] = running_add
        return discounted_r


    def _learn(self):
        """Run one critic and one actor update on the buffer, then clear it."""
        # torch.LongTensor torch.FloatTensor only work for list
        # when transform scalar to Tensor, we could use torch.tensor()

        s = torch.tensor(self.memory_s, dtype=torch.float).to(self.device)
        a = torch.tensor(self.memory_a, dtype=torch.long).to(self.device)
        s_ = torch.tensor(self.memory_s_, dtype=torch.float).to(self.device)
        r = torch.tensor(self.memory_r, dtype=torch.float).to(self.device)
        done = torch.tensor(self.memory_done, dtype=torch.float).to(self.device)
        GAE = self._GAE(s, r, s_, done)
        # The buffer is flushed by length, so it usually ends mid-episode.
        # Bootstrap the tail of the return from the critic's estimate of the
        # last next-state instead of treating the remaining return as zero.
        with torch.no_grad():
            last_value = self.critic(s_[-1:]).squeeze()
        discounted_r = self._discounted_r(r, done, last_value)

        # update for critic
        v = self.critic(s).squeeze()
        critic_loss = F.mse_loss(v, discounted_r)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()
        # update for actor
        prob = self.actor(s)
        dist = Categorical(prob)
        actor_loss = -torch.sum(GAE.detach()*dist.log_prob(a))
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # clear the rollout buffers
        self.memory_s = []
        self.memory_a = []
        self.memory_s_ = []
        self.memory_r = []
        self.memory_done = []
예제 #5
0
파일: DDPG.py 프로젝트: zoetsekas/rl_lib
class DDPG:
    """Deep Deterministic Policy Gradient agent.

    Relies on module-level names defined elsewhere in the file: ``device``,
    ``buffer_size``, ``batch_size``, ``gamma`` and ``tau``.
    """

    def __init__(self, state_space, action_space):
        """Build online/target networks, their optimizers and the buffer."""
        self.actor = Actor(state_space, action_space).to(device)
        self.critic = Critic(state_space, action_space).to(device)

        self.actor_target = Actor(state_space, action_space).to(device)
        self.critic_target = Critic(state_space, action_space).to(device)
        # Targets must start as exact copies of the online networks;
        # independently initialized targets give meaningless early targets.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Optimize this agent's own networks (the bare names `actor` and
        # `critic` are not defined in this scope).
        self.actor_optimiser = optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optimiser = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.mem = ReplayBuffer(buffer_size)

    def act(self, state, add_noise=False):
        """Delegate action selection to the actor's ``act`` method."""
        return self.actor.act(state, add_noise)

    def save(self, fn):
        """Save actor and critic weights to ``<fn>_actor_model.pth`` and
        ``<fn>_critic_model.pth``."""
        torch.save(self.actor.state_dict(), "{}_actor_model.pth".format(fn))
        torch.save(self.critic.state_dict(), "{}_critic_model.pth".format(fn))

    def learn(self):
        """Sample a batch and update critic, actor and target networks."""

        state_batch, action_batch, reward_batch, next_state_batch, masks = self.mem.sample(
            batch_size)

        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(device)
        masks = torch.FloatTensor(masks).to(device)

        # Update Critic
        # NOTE(review): `masks` is used as a done flag below (1 - dones);
        # confirm the buffer stores 1 for terminal transitions.
        self.update_critic(states=state_batch,
                           next_states=next_state_batch,
                           actions=action_batch,
                           rewards=reward_batch,
                           dones=masks)

        # Update actor
        self.update_actor(states=state_batch)

        # Update target networks
        self.update_target_networks()

    def update_actor(self, states):
        """Take one actor step that maximizes the critic's value of the
        actor's own actions on ``states``."""
        actions_pred = self.actor(states)
        loss = -self.critic(states, actions_pred).mean()

        self.actor_optimiser.zero_grad()
        loss.backward()
        self.actor_optimiser.step()

    def update_critic(self, states, next_states, actions, rewards, dones):
        """Regress the critic onto the one-step bootstrapped target.

        The target is ``rewards + gamma * Q_target(s', pi_target(s')) *
        (1 - dones)``.
        """
        # The target is a constant for this regression: compute it without
        # autograd so no gradients are traced into the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            y_i = rewards + (gamma *
                             self.critic_target(next_states, next_actions) *
                             (1 - dones))
        expected_Q = self.critic(states, actions)

        # prediction first, constant target second
        loss = F.mse_loss(expected_Q, y_i)

        self.critic_optimiser.zero_grad()
        loss.backward()
        self.critic_optimiser.step()

    def update_target_networks(self):
        """Soft-update both targets: ``tau * local + (1 - tau) * target``."""
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)

        for target, local in zip(self.critic_target.parameters(),
                                 self.critic.parameters()):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)
예제 #6
0
class TD3:
    """Twin Delayed DDPG agent with a uniform replay buffer.

    The critic is trained on every ``learn`` call; the actor, the target
    networks and the exploration variance are only updated every
    ``policy_freq`` calls.
    """

    def __init__(self, s_dim, a_dim, capacity, batch_size, lr_actor, lr_critic,
                 hidden, var_init, var_decay, var_min, gamma, tau,
                 policy_noise, noise_clip, policy_freq):
        """Store the hyper-parameters and build networks, optimizers, memory."""
        # dimensions / optimizer settings
        self.s_dim, self.a_dim = s_dim, a_dim
        self.lr_actor, self.lr_critic = lr_actor, lr_critic
        self.hidden = hidden
        self.capacity, self.batch_size = capacity, batch_size
        # exploration noise (value passed to np.random.normal as its scale)
        self.var = var_init
        self.var_decay, self.var_min = var_decay, var_min
        # TD3 settings
        self.gamma, self.tau = gamma, tau
        self.policy_noise, self.noise_clip = policy_noise, noise_clip
        self.policy_freq = policy_freq
        self.train_it = 0  # number of learn() calls so far

        # actor and its target copy
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_target = copy.deepcopy(self.actor)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)

        # twin critic and its target copy
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = copy.deepcopy(self.critic)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # replay buffer
        self.memory = ReplayBuffer(capacity, batch_size)

    def get_action(self, s):
        """Return the actor's action for ``s`` perturbed by Gaussian noise.

        The noisy action is clipped element-wise to ``[-1, 1]`` and returned
        as a NumPy array.
        """
        with torch.no_grad():
            mean_action = self.actor(torch.FloatTensor(s))
        mean_action = mean_action.numpy()
        # exploration: draw around the deterministic action
        noisy_action = np.random.normal(mean_action, self.var)
        return np.clip(noisy_action, -1., 1.)

    def learn(self):
        """Run one TD3 training step on a batch drawn from the buffer."""
        self.train_it += 1
        s, a, s_, r, done = self.memory.get_sample()

        with torch.no_grad():
            # target policy smoothing: clipped noise on the target action
            smoothing = (torch.randn_like(a) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(s_) + smoothing).clamp(-1., 1.)

            # clipped double-Q target
            next_q1, next_q2 = self.critic_target(s_, next_action)
            next_q = torch.min(next_q1, next_q2)
            td_target = r + (1 - done) * self.gamma * next_q

        # ---- critic: both heads regress onto the same target ----
        q1, q2 = self.critic(s, a)
        critic_loss = F.mse_loss(q1, td_target) + F.mse_loss(q2, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # ---- delayed updates ----
        if self.train_it % self.policy_freq != 0:
            return

        # Either form works here: use a single Q head directly, or take the
        # minimum of the two heads as done below.
        pi_q1, pi_q2 = self.critic(s, self.actor(s))
        actor_loss = -torch.min(pi_q1, pi_q2).mean()
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # move the targets toward the online networks
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)

        # decay the exploration variance, bounded below by var_min
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        """Blend ``source`` parameters into ``target`` in place.

        Each target parameter becomes ``(1 - tau) * target + tau * source``.
        """
        pairs = zip(target.parameters(), source.parameters())
        for tgt, src in pairs:
            blended = tgt.data * (1.0 - self.tau) + src.data * self.tau
            tgt.data.copy_(blended)
예제 #7
0
class PPO:
    """Proximal Policy Optimization agent with a clipped surrogate objective.

    Transitions are accumulated until exactly ``memory_len`` are stored; the
    agent then runs ``update_epoch`` passes of mini-batch updates over that
    rollout and clears the memory. Network weights and the pending memory
    are persisted under ``<path>/Net``.
    """

    def __init__(self,
                 path,
                 s_dim=3,
                 a_dim=1,
                 hidden=64,
                 actor_lr=1e-4,
                 critic_lr=1e-4,
                 memory_len=64,
                 batch_size=32,
                 update_epoch=10,
                 gamma=0.9,
                 lambda_=0.95,
                 epsilon=0.2):
        """Build the networks and restore saved state when it exists.

        If ``<path>/Net`` contains any entry, the actor and critic weights
        and the five memory lists are loaded from it. The directory itself
        must already exist.
        """
        # Parameter initialization
        self.path = path
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden)
        self.actor_old = Actor(s_dim, a_dim, hidden)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.critic = Critic(s_dim, hidden)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)

        # memory initialization
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

        # Decide whether to resume from a previous run
        if not os.listdir(self.path + '/Net'):
            # nothing saved yet: start from scratch
            print('init completed')
        else:
            # restore the saved networks and memory
            print('loading completed')
            self.actor.load_state_dict(torch.load(self.path +
                                                  '/Net/Actor.pth'))
            self.critic.load_state_dict(
                torch.load(self.path + '/Net/Critic.pth'))
            with open(self.path + '/Net/Memory_s.json', 'r') as f:
                self.memory_s = json.load(f)
            with open(self.path + '/Net/Memory_a.json', 'r') as f:
                self.memory_a = json.load(f)
            with open(self.path + '/Net/Memory_s_.json', 'r') as f:
                self.memory_s_ = json.load(f)
            with open(self.path + '/Net/Memory_r.json', 'r') as f:
                self.memory_r = json.load(f)
            with open(self.path + '/Net/Memory_done.json', 'r') as f:
                self.memory_done = json.load(f)
        # the old policy always starts as a copy of the current one
        self.actor_old.load_state_dict(self.actor.state_dict())

    def store_network(self):
        """Write the network weights and the memory lists to ``<path>/Net``."""
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        with open(self.path + '/Net/Memory_s.json', 'w') as f:
            json.dump(self.memory_s, f)
        with open(self.path + '/Net/Memory_a.json', 'w') as f:
            json.dump(self.memory_a, f)
        with open(self.path + '/Net/Memory_s_.json', 'w') as f:
            json.dump(self.memory_s_, f)
        with open(self.path + '/Net/Memory_r.json', 'w') as f:
            json.dump(self.memory_r, f)
        with open(self.path + '/Net/Memory_done.json', 'w') as f:
            json.dump(self.memory_done, f)

    def choose_action(self, s):
        """Sample an action for state ``s`` and return it as a list.

        The actor returns ``(mean, std)``; ``std`` is placed on the diagonal
        of the covariance matrix. The sample is clamped to ``[-1, 1]``.
        """
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float)
            mean, std = self.actor(s)
            cov = torch.diag_embed(std)
            dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
            a = dist.sample()
            a = torch.clamp(a, -1., 1.).numpy().tolist()
        return a

    def store_transition(self, s, a, s_, r, done):
        """Append one transition and learn when the memory is exactly full."""
        # store transition
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare of data
            s = torch.tensor(self.memory_s,
                             dtype=torch.float)  # [memory_len, s_dim]
            a = torch.tensor(self.memory_a,
                             dtype=torch.float)  # [memory_len, 1(a_dim)]
            r = torch.tensor(self.memory_r, dtype=torch.float)  # [memory_len]
            s_ = torch.tensor(self.memory_s_,
                              dtype=torch.float)  # [memory_len, s_dim]
            done = torch.tensor(self.memory_done,
                                dtype=torch.float)  # [memory_len]
            self._learn(s, a, s_, r, done)

    def _learn(self, s, a, s_, r, done):
        """Run the PPO update epochs on one rollout and clear the memory."""
        gae = self._gae(s, r, s_, done)  # [memory_len, 1]
        r = self._discounted_r(r, s_, done)  # [memory_len, 1]

        # calculate old log probability
        self.actor_old.load_state_dict(self.actor.state_dict())
        old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]

        # batch update the network
        for i in range(self.update_epoch):
            for index in range(0, self.memory_len, self.batch_size):
                self.update_actor(s[index:index + self.batch_size],
                                  a[index:index + self.batch_size],
                                  gae[index:index + self.batch_size],
                                  old_log_prob[index:index + self.batch_size])
                self.update_critic(s[index:index + self.batch_size],
                                   r[index:index + self.batch_size])
        # empty the memory
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        """Return the log-probability of actions ``a`` in states ``s``.

        Uses ``actor_old`` (without gradients) when ``old`` is True and the
        current actor otherwise. The result has a trailing dimension of 1.
        """
        # calculate the log probability
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        # the actor's std is not batched: repeat it once per row of mean
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        """Return the generalized advantage estimate with a trailing dim of 1.

        The TD residual always bootstraps from ``critic(s_)``; ``done`` only
        stops the accumulation from later steps.
        """
        # calculate the general advantage estimation
        with torch.no_grad():
            v = self.critic(s).squeeze()  # [memory_len]
            v_ = self.critic(s_).squeeze()  # [memory_len]
            delta = r + self.gamma * v_ - v

            length = r.shape[0]
            gae = torch.zeros(size=[length])
            running_add = 0
            for t in range(length - 1, -1, -1):
                gae[t] = running_add * self.gamma * self.lambda_ * (
                    1 - done[t]) + delta[t]
                running_add = gae[t]
            return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        """Return the discounted returns with a trailing dimension of 1.

        At a ``done`` step and at the last stored step the return restarts
        from ``r[t] + gamma * critic(s_[t])``; elsewhere it continues the
        running return from the following step.
        """
        # calculate the discounted reward
        with torch.no_grad():
            length = len(r)
            discounted_r = torch.zeros(size=[length])
            v_ = self.critic(s_)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
        return discounted_r.unsqueeze(dim=-1)

    def update_actor(self, s, a, gae, old_log_prob):
        """Take one actor step on the clipped surrogate objective."""
        # calculate the actor loss
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon,
                            1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        # Small bonus that rewards a larger log-std; the original author
        # notes it helped on their task. Summing over action dimensions
        # keeps the loss a scalar, so backward() also works when a_dim > 1.
        loss = loss - 0.001 * self.actor.log_std.sum()
        # update
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        """Take one critic step regressing ``critic(s)`` onto returns ``r``."""
        # calculate critic loss
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        # update
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
예제 #8
0
class A2C:
    """Advantage actor-critic agent for a discrete action space.

    Transitions are buffered until ``max_len`` of them are stored or an
    episode ends; the agent then takes one critic step and one actor step
    on the buffered rollout and clears the buffers.
    """

    def __init__(
        self,
        s_dim,
        a_num,
        device,
        hidden,
        lr_actor,
        lr_critic,
        max_len,
        gamma,
    ):
        """Store the hyper-parameters and build networks, optimizers, buffers.

        ``Actor`` is constructed as ``Actor(s_dim, hidden, a_num)`` and
        ``Critic`` as ``Critic(s_dim, hidden)``; both are moved to ``device``.
        """
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.max_len = max_len
        self.gamma = gamma

        # network initialization
        self.actor = Actor(s_dim, hidden, a_num).to(self.device)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, hidden).to(self.device)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # define memory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def get_action(self, s):
        """Sample an action index from the actor's output for state ``s``."""
        s = torch.FloatTensor(s).to(self.device)
        prob_weights = self.actor(s)
        # select action w.r.t the actions prob
        dist = Categorical(prob_weights)
        action = (dist.sample()).detach().item()
        return action

    def store_transition(self, s, a, s_, r, done):
        """Buffer one transition; learn when the buffer is full or on done.

        ``s_`` is only used to bootstrap the return when the rollout is cut
        before the episode ends.
        """
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_r.append(r)
        if len(self.memory_r) >= self.max_len or done:
            discounted_r = self._discounted_r(self.memory_r, s_, done)
            s = torch.FloatTensor(self.memory_s).to(self.device)
            a = torch.LongTensor(self.memory_a).to(self.device)
            r = torch.FloatTensor(discounted_r).to(self.device)
            self._learn(s, a, r)

    def _learn(self, s, a, r):
        """Take one critic and one actor step, then clear the buffers.

        Args:
            s: batch of states.
            a: 1-D tensor of action indices.
            r: 1-D tensor of discounted returns, one per state.
        """
        # update critic
        # Flatten the critic output so it lines up element-wise with the
        # 1-D returns. Subtracting an [N, 1] value tensor from an [N] return
        # tensor would broadcast to [N, N] and mix every state's value with
        # every other state's return.
        v = self.critic(s).reshape(-1)
        advantage = r - v
        critic_loss = torch.mean(torch.pow(advantage, 2))
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()
        # update actor
        prob = self.actor(s)
        dist = Categorical(prob)
        log_prob = dist.log_prob(a)
        # the advantage is a constant weight for the policy gradient
        actor_loss = -torch.mean(log_prob * advantage.detach())
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()
        # renew the memory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def _discounted_r(self, r, s_, done):
        """Return the discounted returns of rewards ``r`` as a NumPy array.

        The return past the last reward is 0 when ``done`` is truthy and the
        critic's value of ``s_`` otherwise.
        """
        length = len(r)
        discounted_r = np.zeros(length)
        running_add = 0 if done else self.critic(
            torch.FloatTensor(s_).to(self.device)).item()
        for t in range(length - 1, -1, -1):
            running_add = r[t] + running_add * self.gamma
            discounted_r[t] = running_add
        return discounted_r
예제 #9
0
class SAC:
    """Soft Actor-Critic agent with a two-headed Q critic and learned alpha.

    The actor returns a mean and a standard deviation for a Gaussian whose
    samples are squashed with ``tanh``. The critic returns two Q estimates
    and a soft-updated copy of it provides the bootstrap targets. The
    entropy weight ``alpha`` is a trainable scalar with its own optimizer.
    """

    def __init__(self, s_dim, a_dim, hidden, capacity, batch_size, lr, gamma,
                 tau, log_prob_reg):
        """Store hyper-parameters and build networks, optimizers and memory.

        Args:
            s_dim: State dimension passed to the networks.
            a_dim: Action dimension; also sets the target entropy to
                ``-a_dim``.
            hidden: Hidden width passed to the networks.
            capacity: Replay buffer capacity.
            batch_size: Replay buffer sample size.
            lr: Learning rate shared by the actor, critic and alpha
                optimizers.
            gamma: Discount factor.
            tau: Interpolation factor for the target-network soft update.
            log_prob_reg: Small constant added inside the log of the tanh
                correction to keep it finite.
        """
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.lr = lr
        self.capacity = capacity
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.log_prob_reg = log_prob_reg

        # Network
        self.actor = Actor(s_dim, a_dim, hidden)
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic = Critic(s_dim, a_dim, hidden)
        self.critic_target = copy.deepcopy(self.critic)
        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=lr)
        # alpha
        self.target_entropy = -a_dim
        self.alpha = torch.tensor(1, dtype=torch.float, requires_grad=True)
        self.opt_alpha = torch.optim.Adam([self.alpha], lr=lr)
        # replay buffer, memory
        self.memory = ReplayBuffer(capacity, batch_size)

    def get_action(self, s):
        """Sample a tanh-squashed action for state ``s``.

        Args:
            s: State in a form ``torch.tensor`` accepts.

        Returns:
            The action as a (nested) Python list of floats in [-1, 1].
        """
        # Acting needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            s = torch.tensor(data=s, dtype=torch.float)
            mean, std = self.actor(s)
            z = Normal(mean, std).rsample()
            a = torch.tanh(z)
        return a.numpy().tolist()

    def _log_prob(self, s):
        """Sample squashed actions for ``s`` and return their log-density.

        Returns:
            Tuple ``(a, log_prob)``: ``a`` is the tanh-squashed reparameterised
            sample and ``log_prob`` its log-probability summed over the last
            dimension (kept as a size-1 trailing axis).
        """
        mean, std = self.actor(s)
        dist = Normal(mean, std)
        u = dist.rsample()
        a = torch.tanh(u)
        # Change-of-variables correction for the tanh squashing;
        # ``log_prob_reg`` keeps the log finite when |a| approaches 1.
        log_prob = dist.log_prob(u) - torch.log(1 - a.pow(2) +
                                                self.log_prob_reg)
        log_prob = log_prob.sum(-1, keepdim=True)
        return a, log_prob

    def learn(self):
        """Run one update of the critic, the actor, alpha and the target net."""
        # samples from memory
        # NOTE(review): the buffer yields no terminal flag, so the target
        # below always bootstraps from s_ -- confirm this is intended.
        s, a, s_, r = self.memory.get_sample()
        # update q net
        with torch.no_grad():
            a_, log_prob_ = self._log_prob(s_)
            q1_, q2_ = self.critic_target(s_, a_)
            q_target = r + self.gamma * (torch.min(q1_, q2_) -
                                         self.alpha * log_prob_)
        q1, q2 = self.critic(s, a)
        q_loss = F.mse_loss(q1, q_target) + F.mse_loss(q2, q_target)
        self.opt_critic.zero_grad()
        q_loss.backward()
        self.opt_critic.step()
        # update policy net
        a_new, log_prob_new = self._log_prob(s)
        # Using torch.min over both critic heads instead of Q1 alone is an
        # equally valid choice here.
        q_new = self.critic.Q1(s, a_new)
        # alpha is a constant for the policy step; detaching it keeps the
        # policy backward pass from computing a gradient for alpha that is
        # discarded by opt_alpha.zero_grad() below anyway.
        policy_loss = torch.mean(self.alpha.detach() * log_prob_new - q_new)
        self.opt_actor.zero_grad()
        policy_loss.backward()
        self.opt_actor.step()
        # update temperature alpha
        alpha_loss = -torch.mean(self.alpha *
                                 (log_prob_new + self.target_entropy).detach())
        self.opt_alpha.zero_grad()
        alpha_loss.backward()
        self.opt_alpha.step()
        # update target net
        self.soft_update(self.critic_target, self.critic)

    def soft_update(self, target, source):
        """Blend ``source`` parameters into ``target`` with weight ``tau``.

        Each target parameter becomes
        ``(1 - tau) * target + tau * source``, updated in place.
        """
        with torch.no_grad():
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.copy_(target_param * (1.0 - self.tau) +
                                   param * self.tau)
예제 #10
0
class PPO:
    """Clipped-surrogate PPO agent for a continuous action space.

    Transitions are collected into a fixed-length rollout. When it is full,
    advantages and return targets are computed and the actor and critic are
    updated over several epochs of sequential mini-batches.
    """

    def __init__(self, s_dim, a_dim, bound, hidden, device, lr, memory_len,
                 batch_size, update_epoch, gamma, lambda_, epsilon):
        """Store hyper-parameters and build the networks and optimizers.

        Args:
            s_dim: State dimension.
            a_dim: Action dimension.
            bound: Action magnitude; sampled actions are scaled by it and
                clamped to ``[-bound, bound]``.
            hidden: Hidden width passed to the actor.
            device: Requested torch device; falls back to ``'cpu'`` when CUDA
                is unavailable.
            lr: Learning rate shared by both optimizers.
            memory_len: Number of transitions that triggers an update.
            batch_size: Mini-batch size used during the update.
            update_epoch: Number of passes over the rollout per update.
            gamma: Discount factor.
            lambda_: GAE decay factor.
            epsilon: Clipping range of the probability ratio.
        """
        # Parameter initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.bound = bound
        self.hidden = hidden
        self.device = torch.device(
            device if torch.cuda.is_available() else 'cpu')
        self.lr = lr
        self.memory_len = memory_len
        self.batch_size = batch_size
        self.update_epoch = update_epoch
        self.gamma = gamma
        self.lambda_ = lambda_
        self.epsilon = epsilon

        # network initialization
        self.actor = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_old = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
        self.critic = Critic(s_dim).to(self.device)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=self.lr)

        # memory initialization
        self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def get_action(self, s):
        """Sample an action for state ``s`` and scale it to the action bound.

        Returns:
            The clamped action as a Python float (single action dimension).
        """
        # select action w.r.t the actions prob
        # Acting needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float, device=self.device)
            mean, std = self.actor(s)
            cov = torch.diag_embed(std)
            dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
            a = dist.sample()
            a = torch.clamp(a * self.bound, -self.bound, self.bound)
        # Because in this environment, action_dim equals 1, we use .item().
        # When action_dim>1, please use .numpy()
        return a.item()

    def learn(self, s, a, s_, r, done):
        """Store a transition and update the networks once the rollout is full.

        Args:
            s: State the action was taken in.
            a: Action as returned by ``get_action`` (already scaled by
                ``bound``); it is stored divided by ``bound``.
            s_: Next state.
            r: Reward.
            done: Whether the episode ended with this transition.
        """
        # store transition
        self.memory_s.append(s)
        self.memory_a.append(a / self.bound)
        self.memory_s_.append(s_)
        self.memory_r.append(r)
        self.memory_done.append(1 if done else 0)
        if len(self.memory_r) == self.memory_len:
            # prepare of data
            s = torch.tensor(self.memory_s,
                             dtype=torch.float,
                             device=self.device)  # [memory_len, s_dim]
            a = torch.tensor(self.memory_a,
                             dtype=torch.float,
                             device=self.device).unsqueeze(
                                 dim=-1)  # [memory_len, 1(a_dim)]
            r = torch.tensor(self.memory_r,
                             dtype=torch.float,
                             device=self.device)  # [memory_len]
            s_ = torch.tensor(self.memory_s_,
                              dtype=torch.float,
                              device=self.device)  # [memory_len, s_dim]
            gae = self._gae(s, r, s_, self.memory_done)
            r = self._discounted_r(r, s_, self.memory_done)

            # calculate old log probability
            self.actor_old.load_state_dict(self.actor.state_dict())
            old_log_prob = self._log_prob(s, a, old=True)  # [memory_len, 1]

            # batch update the network
            for _ in range(self.update_epoch):
                for index in range(0, self.memory_len, self.batch_size):
                    batch = slice(index, index + self.batch_size)
                    self.update_actor(s[batch], a[batch], gae[batch],
                                      old_log_prob[batch])
                    self.update_critic(s[batch], r[batch])
            # empty the memory
            self.memory_s, self.memory_a, self.memory_s_, self.memory_r, self.memory_done = [], [], [], [], []

    def _log_prob(self, s, a, old=False):
        """Return the log-probability of actions ``a`` in states ``s``.

        Args:
            s: Batch of states.
            a: Batch of actions.
            old: Evaluate with the frozen ``actor_old`` (without gradients)
                instead of the current actor.

        Returns:
            Log-probabilities with a trailing size-1 axis.
        """
        # calculate the log probability
        if old:
            with torch.no_grad():
                mean, std = self.actor_old(s)
        else:
            mean, std = self.actor(s)
        # The actor's std is repeated once per batch row so it lines up
        # with ``mean``.
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        log_prob = dist.log_prob(a).unsqueeze(dim=-1)
        return log_prob

    def _gae(self, s, r, s_, done):
        """Compute generalised advantage estimates for the rollout.

        Args:
            s: Batch of states.
            r: Rewards, one per step.
            s_: Batch of next states.
            done: Per-step 0/1 episode-end flags.

        Returns:
            Advantages with a trailing size-1 axis, on the same device as the
            critic output.
        """
        # calculate the general advantage estimation
        with torch.no_grad():
            # reshape(-1) instead of squeeze() so a length-1 rollout stays 1-D.
            v = self.critic(s).reshape(-1)  # [memory_len]
            v_ = self.critic(s_).reshape(-1)  # [memory_len]
            delta = r + self.gamma * v_ - v

            # Allocate next to ``delta`` so the result stays on the training
            # device; a CPU buffer would break the actor loss on CUDA.
            gae = torch.zeros_like(delta)
            running_add = 0
            for t in range(delta.shape[0] - 1, -1, -1):
                # done[t] cuts the accumulated advantage at episode ends.
                gae[t] = running_add * self.gamma * self.lambda_ * (
                    1 - done[t]) + delta[t]
                running_add = gae[t]
            return torch.unsqueeze(gae, dim=-1)

    def _discounted_r(self, r, s_, done):
        """Compute bootstrapped discounted returns used as critic targets.

        At the last step and at every step flagged ``done`` the return is
        restarted from the critic's value of the next state; elsewhere it is
        accumulated backwards from the following step.

        Returns:
            Returns with a trailing size-1 axis, on the same device as the
            critic output.
        """
        # calculate the discounted reward
        with torch.no_grad():
            length = len(r)
            v_ = self.critic(s_).reshape(-1)
            # Allocate next to the critic output so the targets stay on the
            # training device; a CPU buffer would break the critic loss on
            # CUDA.
            discounted_r = v_.new_zeros(length)
            running_add = 0
            for t in range(length - 1, -1, -1):
                if done[t] == 1 or t == length - 1:
                    discounted_r[t] = v_[t] * self.gamma + r[t]
                else:
                    discounted_r[t] = running_add * self.gamma + r[t]
                running_add = discounted_r[t]
        return discounted_r.unsqueeze(dim=-1)

    def _entropy(self, s, a):
        """Return the entropy of the current policy at states ``s``.

        ``a`` is accepted for signature symmetry but not used.
        """
        mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        entropy = dist.entropy()
        return entropy

    def update_actor(self, s, a, gae, old_log_prob):
        """Take one optimizer step on the clipped surrogate objective."""
        # calculate the actor loss
        log_prob = self._log_prob(s, a)
        ratio = torch.exp(log_prob - old_log_prob)
        surr1 = ratio * gae
        surr2 = torch.clamp(ratio, 1.0 - self.epsilon,
                            1.0 + self.epsilon) * gae
        loss = -torch.mean(torch.min(surr1, surr2))
        # An entropy bonus is deliberately left out: according to the
        # original author it does not help on this task.
        # update
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()

    def update_critic(self, s, r):
        """Take one optimizer step regressing the critic onto targets ``r``."""
        # calculate critic loss
        v = self.critic(s)
        loss = F.mse_loss(v, r)
        # update
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
예제 #11
0
class DDPG:
    """Deep Deterministic Policy Gradient agent with Gaussian exploration.

    Keeps an actor/critic pair plus soft-updated target copies, samples
    training batches from a replay buffer, and decays the exploration noise
    scale after every learning step.
    """

    def __init__(self, s_dim, a_dim, device, hidden, capacity, batch_size,
                 lr_actor, lr_critic, variance_start, variance_decay,
                 variance_min, gamma, tau):
        """Store hyper-parameters and build networks, optimizers and memory.

        Args:
            s_dim: State dimension.
            a_dim: Action dimension.
            device: Torch device the networks and replay samples live on.
            hidden: Hidden width passed to the networks.
            capacity: Replay buffer capacity.
            batch_size: Replay buffer sample size.
            lr_actor: Actor learning rate.
            lr_critic: Critic learning rate.
            variance_start: Initial scale of the exploration noise.
            variance_decay: Multiplicative decay applied per learning step.
            variance_min: Lower bound of the noise scale.
            gamma: Discount factor.
            tau: Interpolation factor for the target soft update.
        """
        # Parameter Initialization
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.device = device
        self.hidden = hidden
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.capacity = capacity
        self.batch_size = batch_size
        self.var = variance_start
        self.var_decay = variance_decay
        self.var_min = variance_min
        self.gamma = gamma
        self.tau = tau

        # Network
        self.actor = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target = Actor(s_dim, hidden, a_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.opt_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target = Critic(s_dim, a_dim, hidden).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.opt_critic = torch.optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # replay buffer, or memory
        self.memory = ReplayBuffer(capacity, batch_size, device)

    def get_action(self, s):
        """Return the actor's action for ``s`` with exploration noise.

        Gaussian noise with scale ``self.var`` is added to the actor output
        and the result is clipped to ``[-1, 1]``.

        Returns:
            The noisy action as a NumPy array.
        """
        with torch.no_grad():
            s = torch.FloatTensor(s).to(self.device)
            # .cpu() is required before .numpy() when the actor runs on a
            # CUDA device; it is a no-op on CPU.
            a = self.actor(s).cpu().numpy()
        a = np.clip(np.random.normal(a, self.var), -1., 1.)
        return a

    def learn(self):
        """Run one critic step, one actor step and the target updates."""
        # samples from memory
        s, a, s_, r, done = self.memory.get_sample()

        # update critic
        # The TD target comes from the target networks and is a constant for
        # the critic step.
        with torch.no_grad():
            td_target = r + (1 - done) * self.gamma * self.critic_target(
                s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.opt_critic.zero_grad()
        critic_loss.backward()
        self.opt_critic.step()

        # update actor
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.opt_actor.zero_grad()
        actor_loss.backward()
        self.opt_actor.step()

        # update target network
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)

        # update variance
        self.var = max(self.var * self.var_decay, self.var_min)

    def soft_update(self, target, source):
        """Blend ``source`` parameters into ``target`` with weight ``tau``.

        Each target parameter becomes
        ``(1 - tau) * target + tau * source``, updated in place.
        """
        with torch.no_grad():
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.copy_(target_param * (1.0 - self.tau) +
                                   param * self.tau)
예제 #12
0
class DDPG:
    """DDPG agent that can persist and restore its networks and memory.

    State is kept under ``<path>/Net``: the four network state dicts, the
    replay memory, the memory counter and the current exploration scale.
    """

    def __init__(self,
                 path,
                 s_dim = 3,            # state-space dimension
                 a_dim = 1,            # action-space dimension
                 hidden = 64,          # hidden layer width
                 device = 'gpu',       # training device ('gpu' is treated as 'cuda')
                 capacity = 2e3,       # replay memory size
                 batch_size= 256,      # training batch size
                 start_lr_step = 512,  # memory counter value at which learning starts
                 gamma=0.9,            # discount factor
                 var_init = 1.,        # initial exploration variance
                 var_decay = 0.9999,   # exploration variance decay
                 var_min = 0.1,        # minimum exploration variance
                 actor_lr = 1e-3,      # actor learning rate
                 critic_lr = 3e-4,     # critic learning rate
                 actor_tau = 0.1,      # actor target update rate
                 critic_tau = 0.2,     # critic target update rate
    ):
        """Build the agent and restore a previous checkpoint when one exists.

        If ``<path>/Net`` is missing or empty the target networks are
        initialised from the online networks; otherwise networks, memory,
        memory counter and exploration scale are loaded from that directory.
        """
        # Initialise all required parameters
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.hidden = hidden
        self.device = self._resolve_device(device)
        self.capacity = capacity
        self.batch_size = batch_size
        self.start_lr_step = start_lr_step
        self.gamma = gamma
        self.var = var_init
        self.var_decay = var_decay
        self.var_min = var_min
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_tau = actor_tau
        self.critic_tau = critic_tau
        self.path = path
        # Not referenced anywhere else in the visible part of this class.
        self.counter = 0

        # Initialise the networks on the same device the memory samples use.
        self.actor = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_target = Actor(s_dim, a_dim, hidden).to(self.device)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(s_dim, a_dim, hidden).to(self.device)
        self.critic_target = Critic(s_dim, a_dim, hidden).to(self.device)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # Initialise the replay memory
        self.memory = Memory(capacity, batch_size, self.device)

        # Decide whether to resume from earlier results. A missing directory
        # is treated like an empty one instead of raising.
        net_dir = self.path + '/Net'
        if not os.path.isdir(net_dir) or not os.listdir(net_dir):
            # Nothing to resume from
            print('init completed')
            self.actor_target.load_state_dict(self.actor.state_dict())
            self.critic_target.load_state_dict(self.critic.state_dict())
        else:
            # Resume the earlier networks and memory. map_location lets a
            # checkpoint written on another device load onto this one.
            print('loading completed')
            self.actor.load_state_dict(
                torch.load(self.path + '/Net/Actor.pth', map_location=self.device))
            self.actor_target.load_state_dict(
                torch.load(self.path + '/Net/Actor_Target.pth', map_location=self.device))
            self.critic.load_state_dict(
                torch.load(self.path + '/Net/Critic.pth', map_location=self.device))
            self.critic_target.load_state_dict(
                torch.load(self.path + '/Net/Critic_Target.pth', map_location=self.device))
            with open(self.path + '/Net/Memory.json', 'r') as f:
                self.memory.memory = json.load(f)
            with open(self.path + '/Net/Counter.json', 'r') as f:
                self.memory.counter = json.load(f)
            with open(self.path + '/Net/Var.json', 'r') as f:
                self.var = json.load(f)

    @staticmethod
    def _resolve_device(device):
        """Map the requested device to a usable ``torch.device``.

        Falls back to the CPU when CUDA is unavailable and accepts ``'gpu'``
        as an alias for ``'cuda'``, since ``'gpu'`` itself is not a valid
        torch device string.
        """
        if not torch.cuda.is_available():
            return torch.device('cpu')
        return torch.device('cuda' if device == 'gpu' else device)

    def choose_action(self, s):
        """Return the actor's action for ``s`` with exploration noise.

        Gaussian noise with scale ``self.var`` is added to the actor output
        and the result is clipped to ``[-1, 1]``.
        """
        with torch.no_grad():
            s = torch.tensor(s, dtype=torch.float, device=self.device)
            # .cpu() is required before .numpy() when running on CUDA.
            a = self.actor(s).cpu().numpy()
        a = np.clip(np.random.normal(loc=a, scale=self.var), -1., 1.)
        # Action: pitch_pos only
        return a

    def store_transition(self, s, a, s_, r, done):
        """Store a transition and learn once enough experience is collected."""
        # Store the experience in the replay memory
        self.memory.store_transition(s, a, s_, r, done)
        if self.memory.counter >= self.start_lr_step:
            s, a, s_, r, done = self.memory.get_sample()
            self._learn(s, a, s_, r, done)

    def store_network(self):
        """Write networks, memory, counter and variance to ``<path>/Net``."""
        # Create the checkpoint directory on first save.
        os.makedirs(self.path + '/Net', exist_ok=True)
        torch.save(self.actor.state_dict(), self.path + '/Net/Actor.pth')
        torch.save(self.actor_target.state_dict(), self.path + '/Net/Actor_Target.pth')
        torch.save(self.critic.state_dict(), self.path + '/Net/Critic.pth')
        torch.save(self.critic_target.state_dict(), self.path + '/Net/Critic_Target.pth')
        with open(self.path + '/Net/Memory.json', 'w') as f:
            json.dump(self.memory.memory, f)
        with open(self.path + '/Net/Counter.json', 'w') as f:
            json.dump(self.memory.counter, f)
        with open(self.path + '/Net/Var.json', 'w') as f:
            json.dump(self.var, f)

        print(self.var, self.memory.counter)

    def _learn(self, s, a, s_, r, done):
        """Run one critic step, one actor step and the target updates."""
        # Update the critic. The TD target comes from the target networks
        # and is a constant for this step, so it is built without autograd.
        with torch.no_grad():
            td_target = r + (1-done) * self.gamma * self.critic_target(s_, self.actor_target(s_))
        q = self.critic(s, a)
        critic_loss = F.mse_loss(q, td_target)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # Update the actor
        q = self.critic(s, self.actor(s))
        actor_loss = -torch.mean(q)
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # Update the target networks
        _soft_update(self.critic_target, self.critic, self.critic_tau)
        _soft_update(self.actor_target, self.actor, self.actor_tau)

        # update variance
        self.var = max(self.var * self.var_decay, self.var_min)