Example #1
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


class A2CAgent:
    """
    init function
    input: env, which is the CartPole-v0
           gamma, 0.99 in this case
           lr, learning rate is 1e-4

    define: env = env, which is the CartPole-v0
            obs_dim: 4 obervations
                    Observation:
                    Type: Box(4)
                    Num	Observation                 Min         Max
                    0	Cart Position             -4.8            4.8
                    1	Cart Velocity             -Inf            Inf
                    2	Pole Angle                 -24 deg        24 deg
                    3	Pole Velocity At Tip      -Inf            Inf

            action_dim: 2 actions
                        Actions:
                        Type: Discrete(2)
                        Num	Action
                        0	Push cart to the left
                        1	Push cart to the right

            value_network: two layer network with input 4 (observation dim) and output 1 (reward?)
            policy_network: two layer network with input 4 (observation dim) and output 2 (action dim)

            value and policy optimizer using default Adam and learning rate
    """
    def __init__(self, env, gamma, lr):

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    """
    input state to get the next action
    
    using policy network to get the next state by using softmax
    """

    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()
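
    # For example (hypothetical numbers): logits of tensor([0.2, -0.1]) give
    # softmax probabilities of roughly [0.574, 0.426], so Categorical.sample()
    # returns action 0 about 57% of the time.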

    """
    form trajectory get all of the information, and calculated the discounted_rewards
    use value network to train the states with new values and compute the loss between the value and target value by using MSE
    same logic for policy network
    
    FloatTensor = FLOAT TYPE ARRAY
   
   t
tensor([[1, 2, 3],
        [4, 5, 6]])
t.view(-1,1)
tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]]) 
     
    """

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory])
        actions = torch.LongTensor([sars[1]
                                    for sars in trajectory]).view(-1, 1)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory])
        next_states = torch.FloatTensor([sars[3] for sars in trajectory])
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1)

        # compute value targets: the discounted return
        # G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
        # accumulated backwards over the trajectory in a single pass
        discounted_rewards = torch.zeros_like(rewards)
        running_return = 0.0
        for t in reversed(range(rewards.size(0))):
            running_return = rewards[t] + self.gamma * running_return
            discounted_rewards[t] = running_return

        # the discounted return already includes the immediate reward,
        # so it is used directly as the value target
        value_targets = discounted_rewards.view(-1, 1)
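        # e.g. with gamma = 0.99 and rewards [1, 1, 1] (hypothetical values):
        # G_2 = 1.0, G_1 = 1 + 0.99 * 1 = 1.99, G_0 = 1 + 0.99 * 1.99 = 2.9701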

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus: H(pi) = -sum_a pi(a|s) * log pi(a|s),
        # summed over the states in the trajectory
        entropy = probs.entropy().sum()
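        # e.g. a uniform distribution [0.5, 0.5] has entropy ln(2) ~= 0.693,
        # the maximum for two actions; the entropy bonus discourages the
        # policy from collapsing to a deterministic choice too early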

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.squeeze(1)).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    """
        zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls).

        loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
        
        opt.step() causes the optimizer to take a step based on the gradients of the parameters.
        """

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
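
A minimal training-loop sketch for the agent above (a hypothetical usage, not
part of the original example; it assumes gym plus the ValueNetwork and
PolicyNetwork classes are importable, and the episode count is illustrative):

import gym

env = gym.make("CartPole-v0")
agent = A2CAgent(env, gamma=0.99, lr=1e-4)

for episode in range(1000):
    state = env.reset()
    trajectory = []  # (state, action, reward, next_state, done) tuples
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        state = next_state
    agent.update(trajectory)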
Example #2
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal


class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: hard-coded below as a simple demo; a real implementation
        # should pass the action range (and action_dim) in as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        # map each tanh-squashed component from [-1, 1] into its action range
        scaled_action = []
        for idx, a in enumerate(action):
            low, high = self.action_range[idx]
            a = a * (high - low) / 2.0 + (high + low) / 2.0
            scaled_action.append(a)
        return scaled_action
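
    # e.g. with an action_range of [0, 2] (hypothetical), a tanh output of 0.5
    # maps to 0.5 * (2 - 0) / 2.0 + (2 + 0) / 2.0 = 1.5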

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())
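        # (next_v_target is the soft state value
        #  V(s) = E_{a~pi}[min(Q1(s, a), Q2(s, a)) - log pi(a|s)],
        #  estimated with a single sampled action from the current policy)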

        # Question: why use two Q-networks?
        # Taking the minimum of two independently trained Q-estimates reduces
        # the overestimation bias of the Q-targets.

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
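        # expected_q is the soft Bellman target r + gamma * V_target(s');
        # (1 - dones) zeroes the bootstrap term at episode boundaries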
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for the policy net and the target value net
        # TODO: Question: what does this part do?
        # The original paper mentions two ways to stabilize the value target:
        # 1. an exponentially moving average of the value network weights for
        #    the target value network (the soft update with tau below)
        # 2. periodically updating the target network; here the update only
        #    runs every delay_step steps
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
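            # (Polyak averaging: target <- tau * online + (1 - tau) * target;
            # with tau << 1 the target network tracks the value net slowly)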
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
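
A minimal interaction-loop sketch for the SAC agent above (hypothetical usage,
not from the original example; it assumes gym, a 2D continuous-action
environment to match the hard-coded action_dim of 2, and that BasicBuffer
exposes push() and __len__() -- both assumptions, since BasicBuffer is defined
elsewhere):

import gym

env = gym.make("LunarLanderContinuous-v2")  # illustrative 2D continuous env
agent = SACAgent(env, gamma=0.99, tau=0.005, v_lr=3e-4, q_lr=3e-4,
                 policy_lr=3e-4, buffer_maxlen=100000)
batch_size = 64

for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)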