def learn(self, memory: ReplayMemory, batch_size: int, choice: int) -> float:
    """learn trains the value network via TD-learning."""
    # Modified section: sample from the chosen replay buffer.
    if choice == 0:  # plain replay memory
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)
    else:  # prioritized replay memory (PER)
        state_batch, action_batch, reward_batch, next_batch, done_batch, idx_batch = \
            memory.sample(batch_size)

    # Q(s, a): gather the entry of each row indexed by action_batch
    values = self.__policy(state_batch.float()).gather(1, action_batch)
    # max_a' Q_target(s', a') as the next-state value
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch

    # Loss between values and expected
    if choice == 0:
        loss = F.smooth_l1_loss(values, expected)
    else:  # PER: keep the per-sample losses to refresh priorities
        loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # TD error
        loss = torch.mean(loss_batch, dim=0)
        memory.update(loss_batch.detach(), idx_batch)

    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        # clamp gradients to [-1, 1] in place
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network via TD-learning."""
    idxs, (state_batch, next_batch, action_batch, reward_batch, done_batch), is_weights = \
        memory.sample(batch_size)

    # Double DQN target: the policy network selects the next action,
    # the target network evaluates it.
    y_batch = []
    current_Q_batch = self.policy(next_batch).cpu().data.numpy()
    max_action_next = np.argmax(current_Q_batch, axis=1)
    target_Q_batch = self.target(next_batch)

    for i in range(batch_size):
        if done_batch[i]:
            y_batch.append(reward_batch[i])
        else:
            target_Q_value = target_Q_batch[i, max_action_next[i]]
            y_batch.append(reward_batch[i] + self.__gamma * target_Q_value)
    y_batch = torch.stack(y_batch)

    values = self.policy(state_batch).gather(1, action_batch)

    # Refresh priorities with the absolute TD error
    abs_error = torch.abs(y_batch - values)
    memory.batch_update(idxs, abs_error)

    # Importance-sampling weighted loss
    loss = (torch.FloatTensor(is_weights).to(self.__device) *
            F.mse_loss(values, y_batch)).mean()

    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
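# The loop above builds the Double DQN target element by element; the same
# target can also be computed in a vectorized way. A minimal sketch with
# illustrative stand-in networks and shapes (none of these names or sizes
# come from the original code):
import torch
import torch.nn as nn

policy = nn.Linear(4, 3)   # stand-in for the policy network
target = nn.Linear(4, 3)   # stand-in for the target network
gamma = 0.99
next_batch = torch.randn(32, 4)
reward_batch = torch.randn(32, 1)
done_batch = torch.randint(0, 2, (32, 1)).float()

best_actions = policy(next_batch).argmax(dim=1, keepdim=True)  # policy selects a'
next_q = target(next_batch).gather(1, best_actions).detach()   # target evaluates a'
y_batch = reward_batch + gamma * next_q * (1. - done_batch)    # (32, 1) TD targets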
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network via TD-learning."""
    state_batch, action_batch, reward_batch, next_batch, done_batch, indices = \
        memory.sample(batch_size)

    values = self.__policy(state_batch.float()).gather(1, action_batch)
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch

    # TD error used to refresh the priorities stored in the sum tree
    error = reward_batch + self.__gamma * values_next.unsqueeze(1) - values
    for i in range(batch_size):
        memory.Tree.update(
            int(indices[i]) + memory.Tree.capacity - 1,
            abs(float(error[i])))

    loss = F.smooth_l1_loss(values, expected)

    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network via TD-learning."""
    # Sample a mini-batch from the replay buffer to update the policy network;
    # all variables come back as tensors.
    state_batch, action_batch, reward_batch, next_batch, done_batch = \
        memory.sample(batch_size)

    # Q_j from the behaviour (policy) network for the stored state/action pairs
    values = self.__policy(state_batch.float()).gather(1, action_batch)

    # Use the target network to compute Q_{j+1} and form
    # expected = r_{j+1} + gamma * max_a' Q_{j+1}(s', a').
    # The (1 - done_batch) factor handles terminal states: if the episode is
    # done, the target degenerates to expected = r_{j+1}.
    # This is the Q-learning update target, evaluated on the target network.
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch

    # Gradient descent on the loss between Q_j and the target
    loss = F.smooth_l1_loss(values, expected)
    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
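# Worked example of the TD-target formula used above, with illustrative
# numbers that are not taken from the original code:
import torch

gamma = 0.99
reward_batch = torch.tensor([[1.0], [1.0]])
values_next = torch.tensor([2.0, 2.0])      # max_a' Q_target(s', a')
done_batch = torch.tensor([[0.0], [1.0]])   # second transition is terminal
expected = (gamma * values_next.unsqueeze(1)) * (1. - done_batch) + reward_batch
print(expected)  # tensor([[2.9800], [1.0000]]): r + gamma * 2.0, and just r when done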
def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # returns the loss value
    """learn trains the value network via TD-learning."""
    # Sample a mini-batch. One of the basic requirements for SGD is that the
    # training data be independent and identically distributed; the sequence
    # of transitions generated while the agent interacts with the environment
    # is highly correlated, so we sample from the replay buffer to break it up.
    state_batch, action_batch, reward_batch, next_batch, done_batch = \
        memory.sample(batch_size)

    # Feed the samples to the learner: value = Q(s, a) from the policy network
    values = self.__policy(state_batch.float()).gather(1, action_batch)

    # Dueling DQN idea: decompose Q(s, a) = V(s) + A(s, a), where V(s) is the
    # value of the state itself and A(s, a) the advantage of action a.
    # V(s) is a scalar and A(s, a) a vector; when they are added, V(s) is
    # broadcast to the shape of A(s, a).
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    # If done = 1 then y_j = r_j, otherwise y_j = r_j + gamma * max_a' Q'(s', a')
    # (see the algorithm in the DQN paper).
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch

    loss = F.smooth_l1_loss(values, expected)  # loss function
    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
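# The comments above describe the dueling decomposition Q(s, a) = V(s) + A(s, a),
# which lives in the network itself rather than in learn(). A minimal sketch of
# such a head (layer sizes and the mean-advantage baseline are assumptions, not
# taken from the original code):
import torch
import torch.nn as nn


class DuelingHead(nn.Module):
    def __init__(self, in_features: int, n_actions: int):
        super().__init__()
        self.value = nn.Linear(in_features, 1)              # V(s): one scalar per state
        self.advantage = nn.Linear(in_features, n_actions)  # A(s, a): one entry per action

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        v = self.value(features)      # (B, 1)
        a = self.advantage(features)  # (B, n_actions)
        # Subtract the mean advantage for identifiability; V broadcasts over actions.
        return v + a - a.mean(dim=1, keepdim=True)


q = DuelingHead(in_features=512, n_actions=3)(torch.randn(8, 512))  # (8, 3)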
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network via TD-learning."""
    if self.use_PR:
        state_batch, action_batch, reward_batch, next_batch, done_batch, idxs, ISWeights = \
            memory.sample(batch_size)
    else:
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

    if self.use_DDQN:
        # Double DQN: the policy network picks the next action,
        # the target network evaluates it.
        actions_value = self.__policy(next_batch.float())
        max_val_action = actions_value.max(1)[1].unsqueeze(-1)
        actions_value = self.__target(next_batch.float()).detach()
        expected = reward_batch + (self.__gamma * actions_value.gather(
            1, max_val_action)) * (1. - done_batch)
        values = self.__policy(state_batch.float()).gather(1, action_batch)
    else:
        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch

    if self.use_PR:
        abs_errors = torch.abs(expected - values).data.cpu().numpy()
        # update priorities with the absolute TD error
        memory.batch_update(idxs, abs_errors)
        loss = (ISWeights *
                F.smooth_l1_loss(values, expected, reduction='none')).mean()
    else:
        loss = F.smooth_l1_loss(values, expected)

    self.__optimizer.zero_grad()
    loss.backward()
    # for param in self.__policy.parameters():
    #     param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network (i.e. the Q network) via TD-learning."""
    state_batch, action_batch, reward_batch, \
        next_batch, done_batch, idx_batch = memory.sample(batch_size)

    values = self.__policy(state_batch.float()).gather(1, action_batch)
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch  # TD target

    # Per-sample TD errors; their mean is the training loss and the detached
    # errors refresh the priorities.
    loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # TD error
    loss = torch.mean(loss_batch, dim=0)
    memory.update(loss_batch.detach(), idx_batch)

    self.__optimizer.zero_grad()
    loss.backward()  # backward
    for param in self.__policy.parameters():
        if param.grad is not None:  # clamp gradients to (-1, 1)
            param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()  # update

    return loss.item()
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network via TD-learning."""
    state_batch, action_batch, reward_batch, next_batch, done_batch, indices = \
        memory.sample(batch_size)

    values = self.__policy(state_batch.float()).gather(1, action_batch)
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch
    loss = F.smooth_l1_loss(values, expected)

    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    for i in range(batch_size):
        memory.update(indices[i], expected[i] - values[i])
    del indices

    return loss.item()
def __init__(self, n_state, n_action, device='cpu'):
    # params
    self.n_state = n_state
    self.n_action = n_action
    self.device = device
    self.epsilon = 1.0
    self.epsilon_decay = 0.999
    self.epsilon_min = 0.1
    self.discount_factor = 0.99
    self.learning_rate = 0.000625  # 0.001
    self.batch_size = 32
    self.num_step = 0
    self.num_exploration = 0
    self.num_train = 0
    self.model_update_interval = 10000
    self.train_start = 50000
    self.replay_memory_size = 100000
    # self.replay_memory = deque(maxlen=self.replay_memory_size)
    self.memory = ReplayMemory(4 + 1, self.replay_memory_size, self.device)

    # model definition
    # action-value function
    self.model = CNN(self.n_state, self.n_action).to(self.device)
    self.loss = nn.MSELoss().to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters(),
                                      lr=self.learning_rate, eps=1.5e-4)
    # target action-value function
    self.target_model = CNN(self.n_state, self.n_action).to(self.device)
    self.model.apply(CNN.init_weights)
    self.target_model.eval()

    self.rand = random.Random()
    self.rand.seed(0)
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    """learn trains the value network via TD-learning."""
    # Sample batch_size transitions (with priorities) from the replay buffer
    # and compute the current target Q values.
    indices, (state_batch, next_batch, action_batch, reward_batch, done_batch), is_weights = \
        memory.sample(batch_size)

    # Q_j from the behaviour (policy) network
    values = self.__policy(state_batch).gather(1, action_batch)

    # Double DQN target: the policy network picks the next action,
    # the target network evaluates it.
    expected = []
    policy_Q_batch = self.__policy(next_batch).cpu().data.numpy()
    max_action_next = np.argmax(policy_Q_batch, axis=1)
    target_Q_batch = self.__target(next_batch)

    for i in range(batch_size):
        if done_batch[i]:
            expected.append(reward_batch[i])
        else:
            target_Q_value = target_Q_batch[i, max_action_next[i]]
            expected.append(reward_batch[i] + self.__gamma * target_Q_value)
    expected = torch.stack(expected)

    # Refresh priorities with the absolute TD error
    TD_error = torch.abs(expected - values)
    memory.update(indices, TD_error)

    # Gradient descent on the importance-sampling weighted loss
    loss = (torch.FloatTensor(is_weights).to(self.__device) *
            F.mse_loss(values, expected)).mean()
    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
def learn(self, memory: ReplayMemory, batch_size: int) -> float:
    state_batch, action_batch, reward_batch, next_batch, weight_batch, done_batch = \
        memory.sample(batch_size)
    weight_batch = weight_batch.to(self.__device)

    values = self.__policy(state_batch.float()).gather(1, action_batch)
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch

    weight_batch /= weight_batch.mean()
    loss = F.smooth_l1_loss(weight_batch * values, weight_batch * expected)

    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()
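# The snippet above folds the normalized importance-sampling weights into the
# Huber loss by scaling both the predictions and the targets. An alternative
# sketch that weights the per-sample losses directly (the dummy tensors below
# are purely illustrative):
import torch
import torch.nn.functional as F

values = torch.randn(32, 1, requires_grad=True)   # Q(s, a) predictions
expected = torch.randn(32, 1)                     # TD targets
weight_batch = torch.rand(32, 1)                  # importance-sampling weights

per_sample = F.smooth_l1_loss(values, expected, reduction='none')  # (32, 1)
loss = (weight_batch / weight_batch.mean() * per_sample).mean()
loss.backward()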
def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # training step
    """learn trains the value network via TD-learning."""
    # Sample a mini-batch of transitions from the replay buffer
    state_batch, action_batch, reward_batch, next_batch, done_batch = \
        memory.sample(batch_size)

    # DQN output Q(S, A) for the stored actions
    values = self.__policy(state_batch.float()).gather(1, action_batch)
    # max_a Q(S', a) from the target network
    values_next = self.__target(next_batch.float()).max(1).values.detach()
    # Q-learning target for Q(S, A)
    expected = (self.__gamma * values_next.unsqueeze(1)) * \
        (1. - done_batch) + reward_batch
    # compute the loss
    loss = F.smooth_l1_loss(values, expected)

    # the usual three steps to update the network parameters
    self.__optimizer.zero_grad()
    loss.backward()
    for param in self.__policy.parameters():
        param.grad.data.clamp_(-1, 1)
    self.__optimizer.step()

    return loss.item()  # return the loss
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)  # directory for the trained models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)  # the Atari environment the agent plays in
agent = Agent(                # create the agent
    env.get_action_dim(),     # number of actions: left, right and no-op
    device,                   # device used for training
    GAMMA,
    new_seed(),
    EPS_START,                # initial epsilon
    EPS_END,                  # minimum epsilon
    EPS_DECAY,                # epsilon decay rate
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
# records the agent's transitions (action -> outcome) for training the network

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS,
                   ncols=50, leave=False, unit="b")

for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
# The number of threads here needs to be adjusted based on the number of
# CPU cores available.
torch.set_num_threads(4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
    restore=restore,
    rlmodel=rlmodel,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS,
                   ncols=50, leave=False, unit="b")

for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
os.mkdir(SAVE_PREFIX)  # create the directory where models are saved
torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
env = MyEnv(device)  # environment
agent = Agent(             # agent
    env.get_action_dim(),  # 3
    device,                # cuda
    GAMMA,                 # 0.99
    new_seed(),
    EPS_START,             # 1
    EPS_END,               # 0.1
    EPS_DECAY,             # 1e6
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)  # initialize the replay buffer

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS,
                   ncols=50, leave=False, unit="b")  # progress bar

for step in progressive:
    if done:  # start a new episode
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
for version in versions:
    print(version)
    dueling = 'dueling' in version
    stable = 'stable' in version
    if stable:
        action_queue = []
    env = MyEnv(device)
    agent = Agent(env.get_action_dim(), device, GAMMA, new_seed(), EPS_START,
                  EPS_END, EPS_DECAY, dueling, pretrained, stable * 0.1)
    if 'PER' in version:
        memory = PERMemory(STACK_SIZE + 1, MEM_SIZE, device)
        # memory = Memory_Buffer_PER(MEM_SIZE)
    else:
        memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
        # memory = Memory_Buffer_PER(MEM_SIZE)

    #### Training ####
    obs_queue: deque = deque(maxlen=5)
    done = True
    avg_reward_arr = []

    progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS,
                       ncols=50, leave=False, unit="b")

    for step in progressive:
        if done:
class DDQN:
    def __init__(self, n_state, n_action, device='cpu'):
        # params
        self.n_state = n_state
        self.n_action = n_action
        self.device = device
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.1
        self.discount_factor = 0.99
        self.learning_rate = 0.000625  # 0.001
        self.batch_size = 32
        self.num_step = 0
        self.num_exploration = 0
        self.num_train = 0
        self.model_update_interval = 10000
        self.train_start = 50000
        self.replay_memory_size = 100000
        # self.replay_memory = deque(maxlen=self.replay_memory_size)
        self.memory = ReplayMemory(4 + 1, self.replay_memory_size, self.device)

        # model definition
        # action-value function
        self.model = CNN(self.n_state, self.n_action).to(self.device)
        self.loss = nn.MSELoss().to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate, eps=1.5e-4)
        # target action-value function
        self.target_model = CNN(self.n_state, self.n_action).to(self.device)
        self.model.apply(CNN.init_weights)
        self.target_model.eval()

        self.rand = random.Random()
        self.rand.seed(0)

    def reset_params(self):
        self.num_step = 0
        self.num_exploration = 0

    def save_model(self, path):
        torch.save(self.model.state_dict(), path)

    def load_model(self, path):
        self.model.load_state_dict(torch.load(path))

    def save_sample(self, state_queue, action, reward, done):
        # sample = [state, action, reward, next_state, done]
        self.memory.push(state_queue, action, reward, done)
        # self.replay_memory.append(sample)

    # epsilon-greedy action selection
    def get_action(self, state, test=False):
        self.num_step += 1
        epsilon = self.epsilon_min if test else self.epsilon
        if np.random.rand(1) < epsilon:
            self.num_exploration += 1
            # action = random.randrange(self.n_action)
            # action = env.action_space.sample()
            action = self.rand.randint(0, self.n_action - 1)
        else:
            with torch.no_grad():
                # T_state = torch.FloatTensor([list(state)[1:]]).to(self.device)
                # T_state = T_state.permute(0, 3, 1, 2)
                T_q = self.model(state)
            action = np.argmax(T_q.to('cpu').detach().numpy())
            # action += 1
        return action

    def epsilon_update(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # update step
    def train(self):
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            self.memory.sample(self.batch_size)
        values = self.model(state_batch.float()).gather(1, action_batch)
        values_next = self.target_model(next_batch.float()).max(1).values.detach()
        expected = (self.discount_factor * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        loss = F.smooth_l1_loss(values, expected)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.num_train += 1

        # Previous Double-DQN implementation on the plain deque replay memory,
        # kept for reference:
        # mini_batch = random.sample(self.replay_memory, self.batch_size)
        # mini_batch = np.array(mini_batch)
        # T_states = np.stack(mini_batch[:, 0])[:, :4]
        # T_actions = np.stack(mini_batch[:, 1])
        # T_rewards = np.stack(mini_batch[:, 2])
        # T_next_states = np.stack(mini_batch[:, 0])[:, 1:]
        # T_dones = np.stack(mini_batch[:, 3])
        # T_states = torch.FloatTensor(T_states).to(self.device)
        # T_actions = torch.LongTensor(T_actions).to(self.device)
        # T_rewards = torch.FloatTensor(T_rewards).to(self.device)
        # T_next_states = torch.FloatTensor(T_next_states).to(self.device)
        # T_dones = torch.FloatTensor(T_dones).to(self.device)
        # T_q = self.model(T_states)
        # # _ holds the max value, T_next_q the index of the max value
        # _, T_next_q = self.model(T_next_states).detach().max(1)
        # T_next_tq = self.target_model(T_next_states).detach()
        # T_next_tq = T_next_tq.gather(1, T_next_q.unsqueeze(1))
        # T_next_a = T_next_tq.squeeze()
        # TD_target = torch.zeros((self.batch_size, self.n_action)).to(self.device)
        # for i in range(self.batch_size):
        #     TD_target[i][T_actions[i]] = T_rewards[i] + \
        #         self.discount_factor * (1. - T_dones[i]) * T_next_a[i]
        # TD_target = TD_target.detach()
        # self.optimizer.zero_grad()
        # cost = self.loss(T_q, TD_target).mean()
        # cost.backward()  # gradient calculation
        # self.optimizer.step()  # gradient update

        if self.num_train % self.model_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())
def choosememory(c):
    if c == 0:
        return ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
    else:
        return PERMemory(STACK_SIZE + 1, MEM_SIZE, device)
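# Possible call site, following the same 0 = uniform / non-zero = PER convention
# as the `choice` argument of learn() above (the concrete value below is
# illustrative):
memory = choosememory(0)    # uniform ReplayMemory
# memory = choosememory(1)  # prioritized PERMemory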
os.mkdir(SAVE_PREFIX)  # create the "./models" directory
torch.manual_seed(new_seed())  # seed the CPU random number generator
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU or CPU?
device = torch.device("cpu")
env = MyEnv(device)
agent = Agent(                # initialize with the preset parameters
    env.get_action_dim(),     # returns 3: the actions ["NOOP", "RIGHT", "LEFT"]
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
# circular buffer; the arguments are channels, capacity and device,
# with capacity MEM_SIZE = 100_000

#### Training ####
obs_queue: deque = deque(maxlen=5)  # observation queue
done = True

progressive = tqdm(
    range(MAX_STEPS),
    total=MAX_STEPS,  # expected number of iterations
    ncols=50,         # width of the progress bar
    leave=False,
    unit="b")

for step in progressive:  # step is an int
    if done:  # start a new episode
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)