import random
from typing import Optional

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# DQN, Dueling_DQN, ReplayMemory, TensorStack4 and TorchDevice are assumed to
# be importable from the project's own model / replay-memory / typing modules.


class Agent(object):

    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
        use_dueling: bool = False,
        use_DDQN: bool = False,
        use_PR: bool = False,  # Prioritized Experience Replay
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma
        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.use_dueling = use_dueling
        self.use_DDQN = use_DDQN
        self.use_PR = use_PR

        if not use_dueling:
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        else:
            self.__policy = Dueling_DQN(action_dim, device).to(device)
            self.__target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(self.__policy.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        if self.use_PR:
            state_batch, action_batch, reward_batch, next_batch, done_batch, idxs, ISWeights = \
                memory.sample(batch_size)
        else:
            state_batch, action_batch, reward_batch, next_batch, done_batch = \
                memory.sample(batch_size)

        if self.use_DDQN:
            # Double DQN: the policy network selects the next action,
            # the target network evaluates it
            actions_value = self.__policy(next_batch.float())
            max_val_action = actions_value.max(1)[1].unsqueeze(-1)
            actions_value = self.__target(next_batch.float()).detach()
            expected = reward_batch + (self.__gamma * actions_value.gather(
                1, max_val_action)) * (1. - done_batch)
            values = self.__policy(state_batch.float()).gather(1, action_batch)
        else:
            values = self.__policy(state_batch.float()).gather(1, action_batch)
            values_next = self.__target(
                next_batch.float()).max(1).values.detach()
            expected = (self.__gamma * values_next.unsqueeze(1)) * \
                (1. - done_batch) + reward_batch

        if self.use_PR:
            abs_errors = torch.abs(expected - values).data.cpu().numpy()
            # update priorities in the replay memory
            memory.batch_update(idxs, abs_errors)
            loss = (ISWeights *
                    F.smooth_l1_loss(values, expected,
                                     reduction='none')).mean()
        else:
            loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        # for param in self.__policy.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
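# --- Usage sketch -----------------------------------------------------------
# A minimal illustration of how the Agent's run/learn/sync/save API might be
# driven in a training loop. The environment wrapper, the memory.push
# signature, and all constants below are assumptions for illustration, not the
# project's actual training script.

def train_sketch(env, memory, agent, num_steps: int = 1_000_000) -> None:
    """Hypothetical driver loop for the Agent above."""
    batch_size = 32
    warmup = 50_000             # steps before learning starts (assumed)
    target_sync_every = 10_000  # target-network sync period (assumed)
    state = env.reset()

    for step in range(num_steps):
        action = agent.run(state, training=True)      # epsilon-greedy action
        next_state, reward, done = env.step(action)   # assumed wrapper API
        memory.push(state, action, reward, next_state, done)  # assumed API
        state = env.reset() if done else next_state

        if step > warmup and step % 4 == 0:
            agent.learn(memory, batch_size)            # one TD-learning update
        if step % target_sync_every == 0:
            agent.sync()                               # copy policy -> target

    agent.save("model_final.pth")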
class Agent(object):

    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma
        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.policy = Dueling_DQN(action_dim, device).to(device)
        self.target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.policy.apply(Dueling_DQN.init_weights)
        else:
            self.policy.load_state_dict(torch.load(restore))
        self.target.load_state_dict(self.policy.state_dict())
        self.__optimizer = optim.Adam(
            self.policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        idxs, (state_batch, next_batch, action_batch, reward_batch,
               done_batch), is_weights = memory.sample(batch_size)

        y_batch = []
        # Double DQN target: the policy network selects the next action, the
        # target network evaluates it; no gradients flow through either
        with torch.no_grad():
            current_Q_batch = self.policy(next_batch).cpu().numpy()
            max_action_next = np.argmax(current_Q_batch, axis=1)
            target_Q_batch = self.target(next_batch)

        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                target_Q_value = target_Q_batch[i, max_action_next[i]]
                y_batch.append(reward_batch[i] +
                               self.__gamma * target_Q_value)
        y_batch = torch.stack(y_batch)

        values = self.policy(state_batch).gather(1, action_batch)

        # refresh priorities with the absolute TD-error
        abs_error = torch.abs(y_batch - values).detach()
        memory.batch_update(idxs, abs_error)

        # importance-sampling weighted loss; reduction='none' keeps the loss
        # per-sample so the weights actually apply element-wise
        loss = (torch.FloatTensor(is_weights).to(self.__device).view(-1, 1) *
                F.mse_loss(values, y_batch, reduction='none')).mean()
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.target.load_state_dict(self.policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.policy.state_dict(), path)
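# --- Target computation, written out ----------------------------------------
# The variant above uses the Double DQN target: the policy network picks the
# greedy action for s' and the target network evaluates it,
#     y = r + gamma * Q_target(s', argmax_a Q_policy(s', a)) * (1 - done),
# whereas the plain variant below bootstraps directly from the target network,
#     y = r + gamma * max_a Q_target(s', a) * (1 - done).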
class Agent(object):

    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma
        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = Dueling_DQN(action_dim, device).to(device)
        self.__target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(Dueling_DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
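# --- Dueling network sketch ---------------------------------------------------
# The agents above construct Dueling_DQN(action_dim, device) and apply
# Dueling_DQN.init_weights, but that network is defined elsewhere in the repo.
# The sketch below is only an illustration of the standard dueling
# decomposition Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) on a 4x84x84 Atari
# frame stack; the layer sizes, frame normalization, and weight initialization
# are assumptions and may differ from the project's actual class.

import torch
import torch.nn as nn


class DuelingDQNSketch(nn.Module):

    def __init__(self, action_dim: int, device: torch.device) -> None:
        super().__init__()
        self.device = device  # kept only to mirror the (action_dim, device) signature
        self.features = nn.Sequential(                 # shared conv trunk
            nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.value = nn.Sequential(                    # state-value stream V(s)
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(), nn.Linear(512, 1))
        self.advantage = nn.Sequential(                # advantage stream A(s, a)
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(), nn.Linear(512, action_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x / 255.)                    # assumes uint8 frame storage
        x = x.flatten(start_dim=1)
        v = self.value(x)                              # [B, 1]
        a = self.advantage(x)                          # [B, action_dim]
        # subtracting the mean advantage keeps V and A identifiable
        return v + a - a.mean(dim=1, keepdim=True)

    @staticmethod
    def init_weights(module: nn.Module) -> None:
        # Kaiming init for conv/linear layers, mirroring the apply(...) call above
        if isinstance(module, (nn.Conv2d, nn.Linear)):
            nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
            nn.init.zeros_(module.bias)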
class Agent(object):  # the bouncing-paddle (Breakout) model

    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,  # restore defaults to None
    ) -> None:
        # store the initial hyperparameters
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma
        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        # the networks can run on either CPU or GPU
        self.__policy = Dueling_DQN(action_dim, device).to(device)
        self.__target = Dueling_DQN(action_dim, device).to(device)
        if restore is None:
            # use the network's own weight initialization
            self.__policy.apply(Dueling_DQN.init_weights)
        else:
            # otherwise load the weights from the restore checkpoint
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(  # optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    # choose actions with an epsilon-greedy policy
    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            # decay epsilon linearly from eps_start towards eps_final
            # over eps_decay steps
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        # with probability 1 - eps, pick the action with the largest Q-value
        if self.__r.random() > self.__eps:
            with torch.no_grad():
                # greedy action from the policy network
                return self.__policy(state).max(1).indices.item()
        # otherwise pick a random action
        return self.__r.randint(0, self.__action_dim - 1)

    '''
    # sample state/action/reward/next transitions from memory to train the net
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # sample batch_size transitions from the replay buffer and compute the
        # current target Q-values
        indices_batch, (state_batch, action_batch, reward_batch, next_batch,
                        done_batch), p_batch = memory.sample(batch_size)
        # compute the value function Q_j with the behaviour (policy) network
        values = self.__policy(state_batch.float()).gather(1, action_batch)
        # compute Q_{j+1} with the target network and the target
        #     expected = r_{j+1} + gamma * max_{a'} Q_{j+1};
        # (1 - done_batch) masks terminal states, where expected reduces to
        # r_{j+1}; this is the bootstrapped part of the Q-learning update
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        tp_error = torch.abs(values - expected)
        memory.update(indices_batch, tp_error)
        # gradient descent on the objective (Q_j - expected)^2
        loss = (torch.FloatTensor(p_batch).to(self.__device) *
                F.mse_loss(values, expected, reduction='none')).mean()
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
    '''

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # sample batch_size transitions from the replay buffer, together with
        # their indices and importance-sampling weights
        indices, (state_batch, next_batch, action_batch, reward_batch,
                  done_batch), is_weights = memory.sample(batch_size)
        # compute the value function Q_j with the behaviour (policy) network
        values = self.__policy(state_batch).gather(1, action_batch)

        expected = []
        # Double DQN target: the policy network selects the next action and
        # the target network evaluates it; no gradients flow through either
        with torch.no_grad():
            policy_Q_batch = self.__policy(next_batch).cpu().numpy()
            max_action_next = np.argmax(policy_Q_batch, axis=1)
            target_Q_batch = self.__target(next_batch)

        for i in range(batch_size):
            if done_batch[i]:
                expected.append(reward_batch[i])
            else:
                target_Q_value = target_Q_batch[i, max_action_next[i]]
                expected.append(reward_batch[i] +
                                self.__gamma * target_Q_value)
        expected = torch.stack(expected)

        # refresh the priorities with the absolute TD-error
        TD_error = torch.abs(expected - values).detach()
        memory.update(indices, TD_error)

        # gradient descent on the IS-weighted objective (Q_j - expected)^2;
        # reduction='none' keeps the loss per-sample so the weights apply
        loss = (torch.FloatTensor(is_weights).to(self.__device).view(-1, 1) *
                F.mse_loss(values, expected, reduction='none')).mean()
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    # synchronize the target network with the policy (behaviour) network
    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    # save the policy network
    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
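# --- Prioritized replay sketch ------------------------------------------------
# The variant above expects memory.sample(batch_size) to return
# (indices, (state, next, action, reward, done), is_weights) and
# memory.update(indices, td_error) to refresh priorities. The project's real
# ReplayMemory (typically sum-tree based) is not shown here; the buffer below
# is a simplified O(N) stand-in with the same interface, and alpha, beta, the
# push signature, and tensor-valued transitions are assumptions.

from typing import List

import numpy as np
import torch


class SimplePrioritizedReplay:

    def __init__(self, capacity: int,
                 alpha: float = 0.6, beta: float = 0.4) -> None:
        self.capacity = capacity
        self.alpha = alpha    # how strongly priorities skew sampling
        self.beta = beta      # importance-sampling correction strength
        self.data: List[tuple] = []
        self.priorities: List[float] = []
        self.pos = 0

    def push(self, state, next_state, action, reward, done) -> None:
        # new transitions get the current maximum priority
        max_p = max(self.priorities, default=1.0)
        if len(self.data) < self.capacity:
            self.data.append((state, next_state, action, reward, done))
            self.priorities.append(max_p)
        else:
            self.data[self.pos] = (state, next_state, action, reward, done)
            self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size: int):
        # proportional prioritization: P(i) = p_i^alpha / sum_j p_j^alpha
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        indices = np.random.choice(len(self.data), batch_size, p=probs)
        # importance-sampling weights, normalized by the maximum weight
        weights = (len(self.data) * probs[indices]) ** (-self.beta)
        weights /= weights.max()
        states, nexts, actions, rewards, dones = \
            zip(*(self.data[i] for i in indices))
        batch = (torch.stack(states), torch.stack(nexts),
                 torch.stack(actions), torch.stack(rewards),
                 torch.stack(dones))
        return indices, batch, weights.astype(np.float32)

    def update(self, indices, td_error) -> None:
        # p_i = |delta_i| + eps, so no transition gets zero priority
        td = td_error.detach().cpu().numpy().reshape(-1)
        for i, err in zip(indices, td):
            self.priorities[int(i)] = float(abs(err)) + 1e-6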