class DQN:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9,
                 epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000,
                 policy_lr=0.01, batch_size=128, device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions
        self.device = device
        self.gamma = gamma
        # epsilon-greedy exploration parameters
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # evaluation mode: disables BatchNorm and Dropout
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        '''Select an action with an epsilon-greedy policy.

        Args:
            state [array]: current state

        Returns:
            action [int]: index of the chosen action
        '''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                # Convert to a tensor before feeding the network; the state elements are
                # originally float64. Note that torch.tensor(state).unsqueeze(0) is
                # equivalent to torch.tensor([state]).
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                q_value = self.policy_net(state)
                # max(1)[1] gives the column index of the largest Q-value, i.e. the greedy action
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        # sample a random batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # convert to tensors,
        # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01], ...])
        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  # e.g. tensor([1., 1., ..., 1.])
        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)  # bool -> float tensor of shape [batch_size]
        # Compute Q(s_t, a): the model computes Q(s_t), then we select the columns of the
        # actions that were actually taken in each sampled transition.
        q_values = self.policy_net(state_batch).gather(1, action_batch)  # calling the net is equivalent to calling its forward()
        # Compute V(s_{t+1}) for all next states with the "older" target_net, taking the
        # best value with max(1)[0]; detach() keeps the target out of the graph.
        next_state_values = self.target_net(next_state_batch).max(1)[0].detach()  # e.g. tensor([0.0060, -0.0171, ...])
        # Compute the expected Q values; for terminal states (done = 1) only the reward remains.
        expected_q_values = reward_batch + self.gamma * next_state_values * (1 - done_batch)
        # MSE loss (a Huber loss such as F.smooth_l1_loss could be used instead)
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
        # Optimize the model
        self.optimizer.zero_grad()  # clear old gradients, otherwise they accumulate across backward() calls
        self.loss.backward()  # backpropagate the loss w.r.t. the policy_net parameters
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)  # clip gradients to prevent them from exploding
        self.optimizer.step()  # update the parameters based on the clipped gradients
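# The DQN agent above relies on an FCN value network and a ReplayBuffer that are not
# shown in this section. The following is only a minimal sketch of what they might look
# like (class layout and hidden size are assumptions, not the original implementation):
import random
from collections import deque

import torch.nn as nn


class FCN(nn.Module):
    """Small fully connected network mapping a state vector to one Q-value per action."""

    def __init__(self, n_states, n_actions, hidden_dim=128):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_states, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
        )

    def forward(self, x):
        return self.layers(x)


class ReplayBuffer:
    """Fixed-capacity buffer of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # transpose the list of transitions into per-field tuples
        return tuple(zip(*batch))

    def __len__(self):
        return len(self.buffer)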
class DQN:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9,
                 epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000,
                 policy_lr=0.01, batch_size=128, device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # device, e.g. cpu or gpu
        self.gamma = gamma
        # epsilon-greedy policy parameters
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # evaluation mode: disables BatchNorm and Dropout
        # note the difference between parameters() and state_dict():
        # the former yields only the tensors with requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        '''Select an action with an epsilon-greedy policy.

        Args:
            state [array]: current state

        Returns:
            action [int]: index of the chosen action
        '''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                # Convert to a tensor before feeding the network; the state elements are
                # originally float64. Note that torch.tensor(state).unsqueeze(0) is
                # equivalent to torch.tensor([state]).
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                q_value = self.policy_net(state)
                # tensor.max(1) returns the per-row maximum and its index, e.g.
                # torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                # so tensor.max(1)[1] is the index of the maximum, i.e. the action
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        # sample a random batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # convert to tensors,
        # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01], ...])
        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  # e.g. tensor([1., 1., ..., 1.])
        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)  # bool -> float tensor of shape [batch_size]
        # Compute Q(s_t, a) for the current (s_t, a) pairs.
        # About torch.gather: for a = torch.Tensor([[1, 2], [3, 4]]),
        # a.gather(1, torch.LongTensor([[0], [1]])) == torch.Tensor([[1], [3]])
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # calling the net is equivalent to calling its forward()
        # Compute V(s_{t+1}) for all next states, i.e. the best value under target_net
        next_state_values = self.target_net(next_state_batch).max(1)[0].detach()  # e.g. tensor([0.0060, -0.0171, ...])
        # Compute expected_q_values; for terminal states (done = 1) only the reward remains
        expected_q_values = reward_batch + self.gamma * next_state_values * (1 - done_batch)
        # self.loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))  # Huber loss alternative
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # clear all old gradients from the previous step
        # loss.backward() computes the gradient of the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)  # clip gradients to prevent them from exploding
        self.optimizer.step()  # update the model

    def save_model(self):
        pass

    def load_model(self):
        pass
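# A minimal training-loop sketch for the agent above (the environment, episode count,
# target-network sync interval, and the memory.push interface are assumptions; the
# original training script is not part of this section):
import gym

env = gym.make("CartPole-v0")
agent = DQN(n_states=env.observation_space.shape[0],
            n_actions=env.action_space.n)
TARGET_UPDATE = 4  # how often (in episodes) to copy policy_net into target_net

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, next_state, done)
        state = next_state
        agent.update()
    if episode % TARGET_UPDATE == 0:
        # the agent never syncs the target network itself, so the loop has to do it
        agent.target_net.load_state_dict(agent.policy_net.state_dict())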
class DQN:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9,
                 epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000,
                 policy_lr=0.01, batch_size=128, device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # device, e.g. cpu or gpu
        self.gamma = gamma
        # epsilon-greedy policy parameters
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # evaluation mode: disables BatchNorm and Dropout
        # note the difference between parameters() and state_dict():
        # the former yields only the tensors with requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def choose_action(self, state, train=True):
        '''Select an action: epsilon-greedy on policy_net during training,
        greedy on target_net during evaluation.'''
        if train:
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                math.exp(-1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            if random.random() > self.epsilon:
                with torch.no_grad():
                    # Convert to a tensor before feeding the network; the state elements are
                    # originally float64. Note that torch.tensor(state).unsqueeze(0) is
                    # equivalent to torch.tensor([state]).
                    state = torch.tensor([state], device=self.device, dtype=torch.float32)
                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                    q_value = self.policy_net(state)
                    # tensor.max(1) returns the per-row maximum and its index,
                    # so tensor.max(1)[1] is the index of the maximum, i.e. the action
                    action = q_value.max(1)[1].item()
            else:
                action = random.randrange(self.n_actions)
            return action
        else:
            with torch.no_grad():
                # evaluation always runs on the CPU here and uses the target network
                state = torch.tensor([state], device='cpu', dtype=torch.float32)
                q_value = self.target_net(state)
                action = q_value.max(1)[1].item()
            return action

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        # sample a random batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # convert to tensors,
        # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01], ...])
        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  # e.g. tensor([1., 1., ..., 1.])
        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)  # bool -> float tensor of shape [batch_size]
        # Q-values of the current and next states under policy_net
        q_values = self.policy_net(state_batch)
        next_q_values = self.policy_net(next_state_batch)
        # pick the Q-value of the action that was actually taken, i.e. Q(s_t, a=a_t)
        q_value = q_values.gather(dim=1, index=action_batch)
        # Nature DQN would compute q_target from the maximum target-network value:
        #   next_q_state_value = self.target_net(next_state_batch).max(1)[0].detach()
        #   q_target = reward_batch + self.gamma * next_q_state_value * (1 - done_batch)
        # Double DQN (used below) differs slightly: the greedy action for s_{t+1} is chosen
        # with policy_net and then evaluated with target_net, i.e.
        # Q'(s_{t+1}, a = argmax_a Q(s_{t+1}, a))
        next_target_values = self.target_net(next_state_batch)
        next_target_q_value = next_target_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1).detach()  # detach keeps the target out of the graph
        # for terminal states (done = 1) only the reward remains
        q_target = reward_batch + self.gamma * next_target_q_value * (1 - done_batch)
        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # clear all old gradients from the previous step
        # loss.backward() computes the gradient of the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)  # clip gradients to prevent them from exploding
        self.optimizer.step()  # update the model

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
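# Side-by-side sketch of the two target computations referenced in update() above
# (free functions with illustrative names, not part of the original code). The only
# difference is how the greedy action for s_{t+1} is chosen: Double DQN picks it with
# policy_net but evaluates it with target_net, which reduces the overestimation bias
# of the max operator.
def nature_dqn_target(target_net, reward, next_state, done, gamma):
    # y = r + gamma * max_a Q_target(s', a)
    next_q = target_net(next_state).max(1)[0].detach()
    return reward + gamma * next_q * (1 - done)


def double_dqn_target(policy_net, target_net, reward, next_state, done, gamma):
    # y = r + gamma * Q_target(s', argmax_a Q_policy(s', a))
    greedy_action = policy_net(next_state).max(1)[1].unsqueeze(1)
    next_q = target_net(next_state).gather(1, greedy_action).squeeze(1).detach()
    return reward + gamma * next_q * (1 - done)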
class PolicyGradient:
    def __init__(self, state_dim, device='cpu', gamma=0.99, lr=0.01, batch_size=5):
        self.gamma = gamma
        self.policy_net = FCN(state_dim)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.batch_size = batch_size

    def choose_action(self, state):
        state = torch.from_numpy(state).float()
        state = Variable(state)
        probs = self.policy_net(state)
        m = Bernoulli(probs)  # Bernoulli distribution over the two possible actions
        action = m.sample()
        action = action.data.numpy().astype(int)[0]  # convert to a scalar
        return action

    def update(self, reward_pool, state_pool, action_pool):
        # Discount the rewards: iterate backwards and apply the discounted-return formula
        running_add = 0
        for i in reversed(range(len(reward_pool))):
            if reward_pool[i] == 0:
                running_add = 0
            else:
                running_add = running_add * self.gamma + reward_pool[i]
            reward_pool[i] = running_add  # this is the return G_t

        # Normalize the rewards
        reward_mean = np.mean(reward_pool)
        reward_std = np.std(reward_pool)
        for i in range(len(reward_pool)):
            reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

        # Gradient descent
        self.optimizer.zero_grad()
        for i in range(len(reward_pool)):  # iterate forwards over the collected steps
            state = state_pool[i]
            action = Variable(torch.FloatTensor([action_pool[i]]))
            reward = reward_pool[i]
            state = Variable(torch.from_numpy(state).float())
            probs = self.policy_net(state)
            m = Bernoulli(probs)
            # Negative score function x reward (the core of the policy-gradient update)
            loss = -m.log_prob(action) * reward
            # print(loss)
            loss.backward()
        self.optimizer.step()

    def save_model(self, path):
        torch.save(self.policy_net.state_dict(), path)

    def load_model(self, path):
        self.policy_net.load_state_dict(torch.load(path))
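# The PolicyGradient agent assumes an FCN(state_dim) that outputs a single Bernoulli
# probability, i.e. it only handles environments with two actions (such as CartPole).
# A minimal sketch of such a policy network (name and layer sizes are assumptions,
# not the original FCN):
import torch.nn as nn


class PolicyFCN(nn.Module):
    def __init__(self, state_dim, hidden_dim=36):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, 1), nn.Sigmoid(),  # probability of taking action 1
        )

    def forward(self, x):
        return self.net(x)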
def train(data_loader, model_index, x_eval_train, y_eval_train):
    ### Model initiation
    fcn = FCN()
    # print(fcn.b_1_conv_1[0].weight.data)
    d = tor.load("./models/vgg16_pretrained.pkl")
    fcn.vgg16_load(d)
    # d = tor.load("./models/fcn_model_1_1.pkl")
    # fcn.load_state_dict(d)
    fcn.cuda()

    # loss_func = tor.nn.CrossEntropyLoss(weight=w)
    loss_func = tor.nn.CrossEntropyLoss()
    # loss_func = tor.nn.MSELoss()
    # optim = tor.optim.SGD(fcn.parameters(), lr=LR, momentum=MOMENTUM)
    optim1 = tor.optim.Adam(fcn.b_6_conv_1.parameters(), lr=LR)
    optim2 = tor.optim.Adam(fcn.b_6_conv_2.parameters(), lr=LR)
    optim3 = tor.optim.Adam(fcn.b_6_conv_3.parameters(), lr=LR)
    optim4 = tor.optim.Adam(fcn.b_7_trans_1.parameters(), lr=LR)
    optim = tor.optim.Adam(fcn.parameters(), lr=LR)

    ### Training
    for epoch in range(EPOCH):
        print("|Epoch: {:>4} |".format(epoch + 1), end="")

        for step, (x_batch, y_batch) in enumerate(data_loader):
            x = Variable(x_batch).type(tor.FloatTensor).cuda()
            y = Variable(y_batch).type(tor.LongTensor).cuda()
            pred = fcn(x)
            optim1.zero_grad()
            optim2.zero_grad()
            optim3.zero_grad()
            optim4.zero_grad()
            optim.zero_grad()
            loss = loss_func(pred, y)
            loss.backward()
            # only the newly added head layers are stepped; the VGG16 backbone stays frozen
            optim1.step()
            optim2.step()
            optim3.step()
            optim4.step()
            print(pred[:2])
            print(tor.max(pred[:5], 1)[1])

        ### Evaluation
        loss = float(loss.data)
        acc = evaluate(fcn, x_eval_train, y_eval_train)
        print("|Loss: {:<8} |Acc: {:<8}".format(loss, acc))

        ### Save model
        if epoch % RECORD_MODEL_PERIOD == 0:
            tor.save(fcn.state_dict(),
                     os.path.join(MODEL_ROOT, "fcn_model_{}_{}.pkl".format(model_index, epoch)))

        ### Record
        record_data = dict()
        if epoch == 0:
            record_data["model_name"] = "fcn_model_{}.pkl".format(model_index)
            record_data["data_size"] = AVAILABLA_SIZE
            record_data["batch_size"] = BATCHSIZE
            record_data["decay"] = str((LR_STEPSIZE, LR_GAMMA))
            record_data["lr_init"] = float(optim1.param_groups[0]["lr"])
            record_data["lr"] = float(optim1.param_groups[0]["lr"])
            record_data["record_epoch"] = RECORD_MODEL_PERIOD
            record_data["loss"] = loss
            record_data["acc"] = acc
        else:
            record_data["model_name"] = "fcn_model_{}.pkl".format(model_index)
            record_data["lr"] = float(optim1.param_groups[0]["lr"])
            record_data["loss"] = loss
            record_data["acc"] = acc

        record(RECORD_FP, record_data)
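# train() depends on an evaluate() helper and on module-level constants (LR, EPOCH,
# BATCHSIZE, MODEL_ROOT, RECORD_MODEL_PERIOD, RECORD_FP, ...) defined elsewhere in the
# script. A plausible sketch of evaluate() as plain per-pixel accuracy follows; the
# original metric and signature may differ:
def evaluate(model, x_eval, y_eval):
    model.eval()
    with tor.no_grad():
        pred = model(x_eval.cuda())          # (N, C, H, W) class scores
        pred_labels = tor.max(pred, 1)[1]    # per-pixel argmax over the class dimension
        acc = (pred_labels == y_eval.cuda()).float().mean().item()
    model.train()
    return acc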
        # Normalize the confusion matrix column for the current label class
        # (labIdx is set by the enclosing loop over label classes, not shown here).
        for predIdx in range(numClass):
            conf[(predIdx, labIdx)] /= (labCnts[labIdx] / 100.0)

    # Per-epoch validation summary
    meanClassAcc = 0.0
    meanIoU = torch.sum(IoU / imgCnt).item() / numClass * 100
    currLoss = running_loss / (i + 1)
    for j in range(numClass):
        meanClassAcc += conf[(j, j)] / numClass
    print("Epoch [%d] Validation Loss: %.4f Validation Pixel Acc: %.2f Mean Class Acc: %.2f IoU: %.2f"
          % (epoch + 1, running_loss / (i + 1), running_acc / imgCnt, meanClassAcc, meanIoU))
    ploter.plot("loss", "val", epoch + 1, running_loss / (i + 1))

    # Keep the best model (by validation loss) seen so far
    if bestLoss > currLoss:
        conf[conf < 0.001] = 0
        print(conf)
        bestConf = conf
        bestLoss = currLoss
        bestIoU = meanIoU
        bestAcc = meanClassAcc
        bestTAcc = running_acc / imgCnt
        torch.save(model.state_dict(), root + "bestModelSeg.pth")
    scheduler.step(currLoss)

# After the last epoch, report the best validation results
print("Optimization finished Validation Loss: %.4f Pixel Acc: %.2f Mean Class Acc: %.2f IoU: %.2f"
      % (bestLoss, bestTAcc, bestAcc, bestIoU))
print(bestConf)
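# The validation fragment above assumes that conf (a numClass x numClass count matrix),
# labCnts, IoU, running_loss and running_acc were accumulated earlier in the validation
# loop. A minimal sketch of the confusion-matrix accumulation for one batch (an
# assumption, not the original code):
def accumulate_confusion(conf, labCnts, pred_labels, target, numClass):
    # pred_labels and target: integer class maps of shape (N, H, W)
    for labIdx in range(numClass):
        mask = (target == labIdx)
        labCnts[labIdx] += mask.sum().item()
        for predIdx in range(numClass):
            conf[(predIdx, labIdx)] += ((pred_labels == predIdx) & mask).sum().item()
    return conf, labCnts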