def prediction(model_fp, input_fp, output_fp, limit):
    """Predict a segmentation mask for each numbered image in a directory.

    Inputs are read from ``input_fp`` as ``0000_sat.jpg``, ``0001_sat.jpg``,
    ... and the mask predicted for image ``i`` is written to ``output_fp`` as
    ``{i:04}_mask.png``.  The number of images attempted is the number of
    entries in ``input_fp``, capped by ``limit``.

    Args:
        model_fp: Path of the saved ``state_dict`` loaded into ``FCN``.
        input_fp: Directory holding the input images.
        output_fp: Directory the predicted masks are written to.
        limit: Maximum number of images to process.  Any falsy value
            (``None``, ``0``) means "no limit".
    """
    model = FCN()
    model.load_state_dict(tor.load(model_fp))
    model.cuda()
    # This function only runs inference: put dropout / batch-norm layers in
    # evaluation mode so predictions are deterministic and do not depend on
    # single-image batch statistics.
    model.eval()

    dir_size = len(os.listdir(input_fp))
    limit = limit if limit else float("inf")

    for i in range(dir_size):
        if i >= limit:
            break

        file_name = os.path.join(input_fp, "{:0>4}_sat.jpg".format(i))
        img = plt.imread(file_name)
        # Move the last axis (index 2) to the front, then add a leading
        # batch axis of size 1 before handing the image to the network.
        img = np.moveaxis(img, 2, 0)
        img = tor.FloatTensor(np.array([img]))
        img_var = Variable(img).type(tor.FloatTensor).cuda()

        pred_img = model(img_var)
        # Index of the maximum along dim 1 (presumably the class axis).
        pred_img = tor.max(pred_img, 1)[1]
        pred_img = pred_img.cpu().data.numpy()
        pred_img = np.moveaxis(pred_img, 0, 2)

        output_img = img_recovery(pred_img)
        scipy.misc.imsave(
            os.path.join(output_fp, "{:0>4}_mask.png".format(i)),
            output_img)
class DQN:
    """Deep Q-Network agent with epsilon-greedy exploration and a target net.

    Two ``FCN`` networks are kept: ``policy_net`` is trained, ``target_net``
    starts as a copy of it and is used to evaluate next-state values.
    """

    def __init__(self,
                 n_states,
                 n_actions,
                 gamma=0.99,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 policy_lr=0.01,
                 batch_size=128,
                 device="cpu"):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            n_states: Forwarded to ``FCN`` as its first argument.
            n_actions: Number of discrete actions; also forwarded to ``FCN``.
            gamma: Discount factor applied to next-state values.
            epsilon_start: Exploration rate before any action was selected.
            epsilon_end: Value the exploration rate decays towards.
            epsilon_decay: Time constant (in selected actions) of the decay.
            memory_capacity: Capacity handed to ``ReplayBuffer``.
            policy_lr: Learning rate of the Adam optimizer.
            batch_size: Number of transitions sampled per update.
            device: Torch device the networks and batches are placed on.
        """
        self.actions_count = 0
        self.n_actions = n_actions
        self.device = device
        self.gamma = gamma
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        # Evaluation mode: disables BatchNormalization updates and Dropout.
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        """Select an action with an exponentially decaying epsilon-greedy rule.

        Args:
            state: Current state (array-like).

        Returns:
            int: Index of the selected action.
        """
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                # Convert to a float32 tensor with a leading batch axis.
                # tensor(state).unsqueeze(0) equals tensor([state]) but avoids
                # building a tensor from a Python list of arrays.
                state = torch.tensor(
                    state, device=self.device,
                    dtype=torch.float32).unsqueeze(0)
                q_value = self.policy_net(state)
                # max(1) returns (values, indices); the index is the action.
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.  The resulting loss tensor is stored in ``self.loss``.
        """
        if len(self.memory) < self.batch_size:
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory.sample(self.batch_size)

        state_batch = torch.tensor(
            np.array(state_batch), device=self.device, dtype=torch.float)
        # Shape (batch, 1) so it can be used as a gather index.
        action_batch = torch.tensor(
            action_batch, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(
            np.array(next_state_batch), device=self.device, dtype=torch.float)
        # bool -> float; kept at shape (batch,) so it lines up element-wise
        # with reward_batch and next_state_values below.
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)

        # Q(s_t, a_t): evaluate all actions, then pick the column of the
        # action that was actually taken.
        q_values = self.policy_net(state_batch).gather(1, action_batch)

        # V(s_{t+1}) = max_a Q_target(s_{t+1}, a), detached from the graph.
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()

        # TD target.  Each transition is masked by its OWN done flag so that
        # terminal transitions reduce to the plain reward.
        expected_q_values = reward_batch + \
            self.gamma * next_state_values * (1 - done_batch)

        # Mean squared error between Q(s_t, a_t) and the TD target.
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))

        # Clear gradients accumulated by the previous step, then backprop.
        self.optimizer.zero_grad()
        self.loss.backward()
        # Clamp every gradient element to [-1, 1] to avoid exploding updates.
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
class DQN:
    """Deep Q-Network agent with epsilon-greedy exploration and a target net.

    ``policy_net`` is the network being trained; ``target_net`` starts as an
    exact copy of it and provides the next-state values of the TD target.
    """

    def __init__(self,
                 n_states,
                 n_actions,
                 gamma=0.99,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 policy_lr=0.01,
                 batch_size=128,
                 device="cpu"):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            n_states: Forwarded to ``FCN`` as its first argument.
            n_actions: Total number of discrete actions.
            gamma: Discount factor applied to next-state values.
            epsilon_start: Exploration rate before any action was selected.
            epsilon_end: Value the exploration rate decays towards.
            epsilon_decay: Time constant (in selected actions) of the decay.
            memory_capacity: Capacity handed to ``ReplayBuffer``.
            policy_lr: Learning rate of the Adam optimizer.
            batch_size: Number of transitions sampled per update.
            device: Torch device (cpu, gpu, ...) used for nets and batches.
        """
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # cpu, gpu, ...
        self.gamma = gamma
        # Parameters of the epsilon-greedy policy.
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        # Evaluation mode: disables BatchNormalization updates and Dropout.
        self.target_net.eval()
        # Only the parameters of policy_net are handed to the optimizer.
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        """Select an action with an exponentially decaying epsilon-greedy rule.

        Args:
            state: Current state (array-like).

        Returns:
            int: Index of the selected action.
        """
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                # Convert to a float32 tensor with a leading batch axis.
                # tensor(state).unsqueeze(0) equals tensor([state]) but avoids
                # building a tensor from a Python list of arrays.
                state = torch.tensor(
                    state, device=self.device,
                    dtype=torch.float32).unsqueeze(0)
                q_value = self.policy_net(state)
                # tensor.max(1) returns the per-row maximum and its index;
                # [1] is the index of the best action.
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.  The resulting loss tensor is stored in ``self.loss``.
        """
        if len(self.memory) < self.batch_size:
            return
        # Randomly sample transitions from the replay memory.
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory.sample(self.batch_size)

        # Convert everything to tensors on the agent's device.
        state_batch = torch.tensor(
            np.array(state_batch), device=self.device, dtype=torch.float)
        # Shape (batch, 1) so it can be used as a gather index.
        action_batch = torch.tensor(
            action_batch, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(
            np.array(next_state_batch), device=self.device, dtype=torch.float)
        # bool -> float; kept at shape (batch,) so it lines up element-wise
        # with reward_batch and next_state_values below.
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)

        # Q(s_t, a_t).  gather(dim=1, index) picks, for every row, the column
        # named by index: for a = [[1, 2], [3, 4]],
        # a.gather(1, [[0], [1]]) == [[1], [4]].
        q_values = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)

        # V(s_{t+1}) = max_a Q_target(s_{t+1}, a), detached from the graph.
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()

        # TD target.  Each transition is masked by its OWN done flag, so for
        # terminal transitions the target equals the reward.
        expected_q_values = reward_batch + \
            self.gamma * next_state_values * (1 - done_batch)

        # Mean squared error loss (F.smooth_l1_loss would give a Huber loss).
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))

        # Clear gradients left over from the previous step, then backprop.
        self.optimizer.zero_grad()
        self.loss.backward()
        # Clamp every gradient element to [-1, 1] to avoid exploding updates.
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self):
        """Placeholder: saving is not implemented for this agent (no-op)."""
        pass

    def load_model(self):
        """Placeholder: loading is not implemented for this agent (no-op)."""
        pass
class DQN:
    """Double DQN agent with epsilon-greedy exploration and a target net.

    ``policy_net`` is trained and chooses the best next action; ``target_net``
    starts as an exact copy of it and evaluates that action for the TD
    target.  ``target_net`` is also the network used for greedy evaluation
    (``choose_action(..., train=False)``) and for saving / loading.
    """

    def __init__(self,
                 n_states,
                 n_actions,
                 gamma=0.99,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 policy_lr=0.01,
                 batch_size=128,
                 device="cpu"):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            n_states: Forwarded to ``FCN`` as its first argument.
            n_actions: Total number of discrete actions.
            gamma: Discount factor applied to next-state values.
            epsilon_start: Exploration rate before any action was selected.
            epsilon_end: Value the exploration rate decays towards.
            epsilon_decay: Time constant (in selected actions) of the decay.
            memory_capacity: Capacity handed to ``ReplayBuffer``.
            policy_lr: Learning rate of the Adam optimizer.
            batch_size: Number of transitions sampled per update.
            device: Torch device (cpu, gpu, ...) used for nets and batches.
        """
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # cpu, gpu, ...
        self.gamma = gamma
        # Parameters of the epsilon-greedy policy.
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        # Evaluation mode: disables BatchNormalization updates and Dropout.
        self.target_net.eval()
        # Only the parameters of policy_net are handed to the optimizer.
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def choose_action(self, state, train=True):
        """Select an action for ``state``.

        Args:
            state: Current state (array-like).
            train: When true, use epsilon-greedy exploration on
                ``policy_net`` and advance the epsilon schedule.  When false,
                act greedily with respect to ``target_net``.

        Returns:
            int: Index of the selected action.
        """
        if train:
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                math.exp(-1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            if random.random() <= self.epsilon:
                return random.randrange(self.n_actions)
            net = self.policy_net
        else:
            net = self.target_net

        with torch.no_grad():
            # Convert to a float32 tensor with a leading batch axis, on the
            # same device as the networks (a hard-coded 'cpu' here would
            # clash with a network living on the GPU).
            state = torch.tensor(
                state, device=self.device, dtype=torch.float32).unsqueeze(0)
            q_value = net(state)
            # tensor.max(1) returns the per-row maximum and its index;
            # [1] is the index of the best action.
            return q_value.max(1)[1].item()

    def update(self):
        """Run one Double-DQN optimisation step on a sampled batch.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.  The resulting loss tensor is stored in ``self.loss``.
        """
        if len(self.memory) < self.batch_size:
            return
        # Randomly sample transitions from the replay memory.
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory.sample(self.batch_size)

        # Convert everything to tensors on the agent's device.
        state_batch = torch.tensor(
            np.array(state_batch), device=self.device, dtype=torch.float)
        # Shape (batch, 1) so it can be used as a gather index.
        action_batch = torch.tensor(
            action_batch, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(
            np.array(next_state_batch), device=self.device, dtype=torch.float)
        # bool -> float; kept at shape (batch,) so it lines up element-wise
        # with reward_batch and the next-state values below.
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)

        # Q(s_t, a_t): evaluate all actions, keep the one actually taken.
        q_value = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)

        # Double DQN target (differs from Nature DQN, which takes the max of
        # the target net directly): the action is chosen by policy_net,
        # a* = argmax_a Q(s_{t+1}, a), and evaluated by target_net,
        # Q'(s_{t+1}, a*).  No gradient is needed through the target.
        with torch.no_grad():
            next_actions = self.policy_net(next_state_batch).max(
                1)[1].unsqueeze(1)
            next_target_q_value = self.target_net(next_state_batch).gather(
                1, next_actions).squeeze(1)

        # Each transition is masked by its OWN done flag, so for terminal
        # transitions the target equals the reward.
        q_target = reward_batch + \
            self.gamma * next_target_q_value * (1 - done_batch)

        # Mean squared error loss.
        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1))

        # Clear gradients left over from the previous step, then backprop.
        self.optimizer.zero_grad()
        self.loss.backward()
        # Clamp every gradient element to [-1, 1] to avoid exploding updates.
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self, path):
        """Save the ``state_dict`` of ``target_net`` to ``path``."""
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        """Load a ``state_dict`` from ``path`` into ``target_net``.

        ``policy_net`` is left untouched.
        """
        self.target_net.load_state_dict(torch.load(path))
class PolicyGradient:
    """Policy-gradient agent whose policy outputs a Bernoulli probability.

    The policy network maps a state to the parameter of a Bernoulli
    distribution from which the (binary) action is sampled.
    """

    def __init__(self, state_dim, device='cpu', gamma=0.99, lr=0.01, batch_size=5):
        """Build the policy network and its RMSprop optimizer.

        Args:
            state_dim: Forwarded to ``FCN`` as its only argument.
            device: Accepted for interface compatibility; not used here.
            gamma: Discount factor used when accumulating returns.
            lr: Learning rate of the RMSprop optimizer.
            batch_size: Stored on the instance; not used by this class itself.
        """
        self.gamma = gamma
        self.policy_net = FCN(state_dim)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.batch_size = batch_size

    def choose_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: NumPy array holding the current state.

        Returns:
            The sampled action as an integer scalar.
        """
        state = Variable(torch.from_numpy(state).float())
        probs = self.policy_net(state)
        action = Bernoulli(probs).sample()
        # Take the first element so a scalar is returned instead of an array.
        return action.data.numpy().astype(int)[0]

    def update(self, reward_pool, state_pool, action_pool):
        """Perform one policy-gradient step from the collected transitions.

        ``reward_pool`` is modified IN PLACE: it is first overwritten with the
        discounted returns and then with their normalised values.

        Args:
            reward_pool: Per-step rewards.  A reward of exactly 0 resets the
                running return (it acts as an episode boundary marker).
            state_pool: States (NumPy arrays) aligned with ``reward_pool``.
            action_pool: Actions taken, aligned with ``reward_pool``.
        """
        if len(reward_pool) == 0:
            return  # nothing collected, nothing to learn from

        # Discounted return, accumulated backwards: G_t = r_t + gamma * G_{t+1}.
        running_add = 0
        for i in reversed(range(len(reward_pool))):
            if reward_pool[i] == 0:
                running_add = 0
            else:
                running_add = running_add * self.gamma + reward_pool[i]
            reward_pool[i] = running_add

        # Normalise the returns.  When they are (numerically) all identical
        # the standard deviation is 0; dividing by it would produce NaN and
        # corrupt the network weights, so only centre them in that case.
        reward_mean = np.mean(reward_pool)
        reward_std = np.std(reward_pool)
        if reward_std < 1e-8:
            reward_std = 1.0
        for i in range(len(reward_pool)):
            reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

        # Gradient descent: gradients of the per-step losses are accumulated
        # by successive backward() calls and applied in one optimizer step.
        self.optimizer.zero_grad()
        for i, reward in enumerate(reward_pool):
            state = Variable(torch.from_numpy(state_pool[i]).float())
            action = Variable(torch.FloatTensor([action_pool[i]]))
            probs = self.policy_net(state)
            m = Bernoulli(probs)
            # Negative score function times return: -log pi(a|s) * G.
            loss = -m.log_prob(action) * float(reward)
            loss.backward()
        self.optimizer.step()

    def save_model(self, path):
        """Save the ``state_dict`` of the policy network to ``path``."""
        torch.save(self.policy_net.state_dict(), path)

    def load_model(self, path):
        """Load a ``state_dict`` from ``path`` into the policy network."""
        self.policy_net.load_state_dict(torch.load(path))
num_workers=4) numClass = 8 numPlanes = 16 levels = 4 levelDepth = 2 kernelSize = 3 model = FCN(numPlanes, levels, levelDepth, numClass, kernelSize, 0.1) mapLoc = None if haveCuda else {'cuda:0': 'cpu'} if haveCuda: model = model.cuda() model.load_state_dict( torch.load(root + 'bestModelSeg.pth', map_location=mapLoc)) model.eval() for i, (images, labels) in enumerate(valloader): if torch.cuda.is_available(): images = images.cuda() pred = model(images) _, predClass = torch.max(pred, 1) #img = Image.fromarray(Colorize(predClass[0]).permute(1, 2, 0).numpy().astype('uint8')) orig = trBack(images[0].cpu()).numpy() img = Colorize(predClass[0]).numpy() img = (0.5 * img + 0.5 * orig).transpose(1, 2, 0) img = cv2.resize(img, dsize=None, fx=4, fy=4)