Example no. 1
class Agent(object):
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = Dueling_DQN(action_dim, device).to(device)
        self.__target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(Dueling_DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
Example no. 2
class Agent(object):
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,
            eps_start: float,
            eps_final: float,
            eps_decay: float,
            restore: Optional[str] = None,
            use_dueling: bool = False,
            use_DDQN: bool = False,
            use_PR: bool = False,  # Prioritized Experience Replay
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.use_dueling = use_dueling
        self.use_DDQN = use_DDQN
        self.use_PR = use_PR

        if not use_dueling:
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        else:
            self.__policy = Dueling_DQN(action_dim, device).to(device)
            self.__target = Dueling_DQN(action_dim, device).to(device)

        if restore is None:
            self.__policy.apply(self.__policy.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        if self.use_PR:
            state_batch, action_batch, reward_batch, next_batch, done_batch, idxs, ISWeights = \
                memory.sample(batch_size)
        else:
            state_batch, action_batch, reward_batch, next_batch, done_batch = \
                memory.sample(batch_size)

        if self.use_DDQN:
            actions_value = self.__policy(next_batch.float())
            max_val_action = actions_value.max(1)[1].unsqueeze(-1)

            actions_value = self.__target(next_batch.float()).detach()
            expected = reward_batch + (self.__gamma * actions_value.gather(
                1, max_val_action)) * (1. - done_batch)
            values = self.__policy(state_batch.float()).gather(1, action_batch)
        else:
            values = self.__policy(state_batch.float()).gather(1, action_batch)
            values_next = self.__target(
                next_batch.float()).max(1).values.detach()
            expected = (self.__gamma * values_next.unsqueeze(1)) * \
                (1. - done_batch) + reward_batch

        if self.use_PR:
            abs_errors = torch.abs(expected - values).detach().cpu().numpy()
            # update priority
            memory.batch_update(idxs, abs_errors)
            loss = (
                ISWeights *
                F.smooth_l1_loss(values, expected, reduction='none')).mean()
        else:
            loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        #for param in self.__policy.parameters():
        #    param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
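
The three boolean flags in Example no. 2 select among the variants: Dueling_DQN vs. plain DQN, a Double-DQN target, and prioritized experience replay. A minimal construction sketch, assuming illustrative hyperparameter values (none of the numbers below come from the listings):

import torch

# Hypothetical values; only the constructor signature is taken from the class above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(
    action_dim=4,          # e.g. a 4-action Atari game (assumption)
    device=device,
    gamma=0.99,
    seed=0,
    eps_start=1.0,
    eps_final=0.1,
    eps_decay=1_000_000,   # number of steps over which epsilon is annealed
    use_dueling=True,      # build Dueling_DQN networks instead of plain DQN
    use_DDQN=True,         # use the Double-DQN target in learn()
    use_PR=True,           # expect a prioritized replay memory in learn()
)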
Example no. 3
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.policy = Dueling_DQN(action_dim, device).to(device)
        self.target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.policy.apply(Dueling_DQN.init_weights)
        else:
            self.policy.load_state_dict(torch.load(restore))
        self.target.load_state_dict(self.policy.state_dict())
        self.__optimizer = optim.Adam(
            self.policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        idxs, (state_batch, next_batch, action_batch, reward_batch,
               done_batch), is_weights = memory.sample(batch_size)

        y_batch = []
        current_Q_batch = self.policy(next_batch).cpu().data.numpy()
        max_action_next = np.argmax(current_Q_batch, axis=1)
        target_Q_batch = self.target(next_batch)

        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                target_Q_value = target_Q_batch[i, max_action_next[i]]
                y_batch.append(reward_batch[i] + self.__gamma * target_Q_value)

        y_batch = torch.stack(y_batch)
        values = self.policy(state_batch).gather(1, action_batch)

        abs_error = torch.abs(y_batch - values)
        memory.batch_update(idxs, abs_error)

        loss = (torch.FloatTensor(is_weights).to(self.__device) *
                F.mse_loss(values, y_batch)).mean()

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.target.load_state_dict(self.policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.policy.state_dict(), path)
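
Example no. 3 relies on a prioritized replay buffer whose sample() returns (indices, batch, IS weights) and which exposes batch_update(indices, td_errors); that buffer is not part of the listing. A minimal proportional-prioritization sketch with the same interface, using plain NumPy sampling instead of the usual sum-tree (every name and default below is an assumption):

import numpy as np
import torch


class SimplePrioritizedMemory:
    """Minimal proportional-prioritization buffer matching the sample()/batch_update()
    interface used in Example no. 3. O(N) NumPy sampling stands in for the usual sum-tree."""

    def __init__(self, capacity, device, alpha=0.6, beta=0.4, eps=1e-6):
        self.capacity = capacity
        self.device = device
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data = []
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0

    def push(self, state, action, reward, next_state, done):
        # new transitions get the current maximum priority so they are sampled at least once
        max_p = self.priorities.max() if self.data else 1.0
        if len(self.data) < self.capacity:
            self.data.append((state, action, reward, next_state, done))
        else:
            self.data[self.pos] = (state, action, reward, next_state, done)
        self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        p = self.priorities[:len(self.data)] ** self.alpha
        p /= p.sum()
        idxs = np.random.choice(len(self.data), batch_size, p=p)
        weights = (len(self.data) * p[idxs]) ** (-self.beta)
        weights /= weights.max()                       # normalized importance-sampling weights
        # states are assumed to already be torch tensors (stacked frames)
        states, actions, rewards, nexts, dones = zip(*(self.data[i] for i in idxs))
        batch = (torch.stack(states).to(self.device),
                 torch.stack(nexts).to(self.device),
                 torch.tensor(actions, device=self.device).unsqueeze(1),
                 torch.tensor(rewards, dtype=torch.float32, device=self.device).unsqueeze(1),
                 torch.tensor(dones, dtype=torch.float32, device=self.device).unsqueeze(1))
        return idxs, batch, weights.astype(np.float32)

    def batch_update(self, idxs, td_errors):
        # new priority = |TD error| + eps, as in the prioritized experience replay paper
        if torch.is_tensor(td_errors):
            td_errors = td_errors.detach().cpu().numpy()
        self.priorities[idxs] = np.abs(td_errors).reshape(-1) + self.eps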
Example no. 4
class Agent(object):

    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = Dueling_DQN(action_dim, device).to(device)
        self.__target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(Dueling_DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
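
All of these variants instantiate Dueling_DQN(action_dim, device) and initialize it through Dueling_DQN.init_weights, but the network itself is defined elsewhere. A minimal sketch of what such a dueling network typically looks like for 84x84x4 Atari frame stacks; the architecture, input scaling, and initializer below are assumptions, not the project's actual code:

import torch
import torch.nn as nn
import torch.nn.functional as F


class Dueling_DQN_sketch(nn.Module):
    """Plausible stand-in for the Dueling_DQN referenced above (standard Atari layout)."""

    def __init__(self, action_dim: int, device):
        super().__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # two streams: state value V(s) and action advantage A(s, a)
        self.fc_value = nn.Linear(64 * 7 * 7, 512)
        self.fc_adv = nn.Linear(64 * 7 * 7, 512)
        self.value = nn.Linear(512, 1)
        self.adv = nn.Linear(512, action_dim)
        self.device = device

    def forward(self, x):
        x = x.to(self.device).float() / 255.0          # assumes uint8 frame stacks
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        v = self.value(F.relu(self.fc_value(x)))       # shape (batch, 1)
        a = self.adv(F.relu(self.fc_adv(x)))           # shape (batch, action_dim)
        # combine streams: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return v + a - a.mean(dim=1, keepdim=True)

    @staticmethod
    def init_weights(module):
        # used via net.apply(Dueling_DQN.init_weights) in the listings above
        if isinstance(module, (nn.Conv2d, nn.Linear)):
            nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
            nn.init.zeros_(module.bias)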
Example no. 5
class Agent(object):  # the agent for the paddle-bounce (Breakout-style) game
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,
            eps_start: float,
            eps_final: float,
            eps_decay: float,
            restore: Optional[str] = None,  # restore defaults to None
    ) -> None:
        self.__action_dim = action_dim  # set the model's initial parameters
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = Dueling_DQN(action_dim,
                                    device).to(device)  # can run on CPU or GPU
        self.__target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(Dueling_DQN.init_weights)  # use the DQN's default weight initialization
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # otherwise load the weights from restore
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(  # optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    # select an action with the epsilon-greedy strategy
    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            # decay epsilon linearly from eps_start towards eps_final
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay  # anneals the exploration rate, not the learning rate
            self.__eps = max(self.__eps, self.__eps_final)

        # with probability 1 - eps, choose the action with the largest Q-value
        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(
                    1).indices.item()  # greedy action from the policy network
        # otherwise choose a random action
        return self.__r.randint(0, self.__action_dim - 1)

    '''
    # Train the network from (state, action, reward, next state) tuples drawn from memory
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # Sample batch_size transitions from the experience replay buffer and compute the current target Q-values
        indices_batch, (state_batch, action_batch, reward_batch, next_batch, done_batch), p_batch = \
            memory.sample(batch_size)
        # Use the behaviour (policy) network to compute the value function Q_j
        values = self.__policy(state_batch.float()).gather(1, action_batch)

        # Use the target network to compute Q_{j+1} and the target
        #   expected = r_{j+1} + gamma * max_{a'} Q_{j+1}
        # where (1 - done_batch) checks for terminal states; if terminal, the target collapses to expected = r_{j+1}.
        # This is part of the Q-learning update rule.
        values_next = self.__target(next_batch.float()).max(1).values.detach()  # max Q-value from the target network
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch

        tp_error = torch.abs(values - expected)
        memory.update(indices_batch, tp_error)

        # Gradient descent on the objective (Q_j - expected)^2
        loss = (torch.FloatTensor(p_batch).to(self.__device) * F.mse_loss(values, expected)).mean()

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
    '''

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # Sample batch_size transitions from the experience replay buffer and compute the current target Q-values
        indices, (state_batch, next_batch, action_batch, reward_batch, done_batch), is_weights = \
            memory.sample(batch_size)
        # Use the behaviour (policy) network to compute the value function Q_j
        values = self.__policy(state_batch).gather(1, action_batch)

        expected = []
        policy_Q_batch = self.__policy(next_batch).cpu().data.numpy()
        max_action_next = np.argmax(policy_Q_batch, axis=1)
        target_Q_batch = self.__target(next_batch)

        for i in range(batch_size):
            if done_batch[i]:
                expected.append(reward_batch[i])
            else:
                target_Q_value = target_Q_batch[i, max_action_next[i]]
                expected.append(reward_batch[i] +
                                self.__gamma * target_Q_value)

        expected = torch.stack(expected)
        TD_error = torch.abs(expected - values)
        memory.update(indices, TD_error)

        # Gradient descent on the objective (Q_j - expected)^2
        loss = (torch.FloatTensor(is_weights).to(self.__device) *
                F.mse_loss(values, expected)).mean()

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    # synchronize the target network with the policy network, i.e. the target and behaviour networks
    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    # save the policy network
    def save(self, path: str) -> None:  # save the results
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
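
For completeness, a hypothetical driver loop tying run, learn, sync, and save together; env, memory, and every constant below are illustrative assumptions, since the original training script is not part of these listings:

TARGET_SYNC_EVERY = 10_000   # steps between target-network syncs (assumption)
WARMUP_STEPS = 50_000        # fill the replay buffer before learning starts
BATCH_SIZE = 32

state = env.reset()                                   # `env` is assumed to yield stacked frames
for step in range(1, 5_000_001):
    action = agent.run(state, training=step > WARMUP_STEPS)
    next_state, reward, done = env.step(action)       # assumed 3-tuple step API
    memory.push(state, action, reward, next_state, done)
    state = env.reset() if done else next_state

    if step > WARMUP_STEPS:
        loss = agent.learn(memory, BATCH_SIZE)
    if step % TARGET_SYNC_EVERY == 0:
        agent.sync()                                  # copy policy weights into the target network
    if step % 100_000 == 0:
        agent.save(f"checkpoint_{step}.pth")          # hypothetical checkpoint path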