def update(self, lastState, lastAction, reward, state):
    if lastState is None:
        return self.initNN(state)

    self._memory.push(
        (torch.FloatTensor([lastState]),
         torch.LongTensor([[lastAction]]),
         torch.FloatTensor([reward]),
         torch.FloatTensor([state])))

    if len(self._memory) < self._batchSize:
        return

    transitions = self._memory.sample(self._batchSize)
    batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)

    batch_state = torch.cat(batch_state)
    batch_action = torch.cat(batch_action)
    batch_reward = torch.cat(batch_reward)
    batch_next_state = torch.cat(batch_next_state)

    batch_prediction = self._NN(batch_state)
    batch_next_prediction = self._NN(batch_next_state)

    # Q(s, a) for the actions that were actually taken
    current_q_values = batch_prediction.gather(1, batch_action)[:, 0]
    # max_a' Q(s', a'), detached so the bootstrapped part of the target does not backpropagate
    max_next_q_values = batch_next_prediction.detach().max(1)[0]

    # Soft-updated TD target: blend the current estimate with the bootstrapped target
    expected_q_values = (
        1.0 - self._alpha) * current_q_values + self._alpha * (
            batch_reward + self._gamma * max_next_q_values)

    loss = F.smooth_l1_loss(current_q_values, expected_q_values)
    self._optimizer.zero_grad()
    loss.backward()
    self._optimizer.step()
def train(self, left_img, right_img, depth_img):
    """
    :param left_img: left RGB camera image
    :param right_img: right RGB camera image
    :param depth_img: reverse depth format. 0 means infinite distance;
        255 (or 1.0) may mean either a disparity of 255 pixels or a distance
        of ~0 cm from both cameras. If the convention is reversed, the depth
        mask below must be changed.
    :return: training loss for this batch
    """
    self.model.train()
    self._adjust_learning_rate(self._train_calls)
    self._train_calls += 1

    # todo: auto convert 0-255 to 0-1
    img_l = Variable(torch.FloatTensor(left_img))
    img_r = Variable(torch.FloatTensor(right_img))
    true_disparity = Variable(torch.FloatTensor(depth_img))
    if torch.cuda.is_available():
        img_l, img_r, true_disparity = img_l.cuda(), img_r.cuda(), true_disparity.cuda()

    # only supervise pixels with a finite (non-zero) ground-truth disparity
    finite_depth_mask = true_disparity > 0
    finite_depth_mask.detach_()

    output1, output2, output3 = self.model(img_l, img_r)
    output1 = torch.squeeze(output1, 1)
    output2 = torch.squeeze(output2, 1)
    output3 = torch.squeeze(output3, 1)

    # weighted sum of the three intermediate disparity predictions
    loss = (0.5 * F.smooth_l1_loss(output1[finite_depth_mask],
                                   true_disparity[finite_depth_mask],
                                   size_average=True) +
            0.7 * F.smooth_l1_loss(output2[finite_depth_mask],
                                   true_disparity[finite_depth_mask],
                                   size_average=True) +
            F.smooth_l1_loss(output3[finite_depth_mask],
                             true_disparity[finite_depth_mask],
                             size_average=True))

    # reset gradients accumulated from the previous call before backprop
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.data[0]
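# The "todo: auto convert 0-255 to 0-1" above is left unimplemented in this snippet.
# A minimal sketch of what that normalization might look like, assuming uint8 inputs
# and the reverse-depth convention described in the docstring; the helper name
# to_unit_range is hypothetical and not part of the original code.
import numpy as np
import torch


def to_unit_range(img):
    # Scale a 0-255 image to float32 in [0, 1]; 0 stays "infinitely far"
    # and 255 maps to 1.0 (closest measurable distance) under the convention above.
    arr = np.asarray(img, dtype=np.float32)
    if arr.max() > 1.0:
        arr = arr / 255.0
    return torch.from_numpy(arr)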
def update(self, s, a, r, done, s_next):
    s = torch.tensor(s, device=device)
    # s = self.policy_net.preprocess(s)
    a = torch.tensor(a, device=device)
    r = torch.tensor(r, device=device)
    done = torch.tensor(done, device=device)
    s_next = torch.tensor(s_next, device=device)
    # s_next = self.policy_net.preprocess(s_next)

    if not self.ready:
        self.memory.add(Transition(s, a, r, done, s_next))
        return

    # Using batch memory
    self.memory.add(Transition(s, a, r, done, s_next))

    if isinstance(self.memory, WeightedMemory):
        tree_idx, batch, sample_weights = self.memory.sample(self.batch_size)
        sample_weights = torch.tensor(sample_weights, device=device)
    else:
        batch = self.memory.sample(self.batch_size)

    batch_t = Transition(*zip(*batch))  # transposed batch

    # Get expected Q values
    s_batch, a_batch, r_batch, done_batch, s_next_batch = batch_t
    s_batch = torch.cat(s_batch)
    a_batch = torch.stack(a_batch)
    r_batch = torch.stack(r_batch).view(-1, 1)
    s_next_batch = torch.cat(s_next_batch)
    done_batch = torch.stack(done_batch).view(-1, 1)
    q = self.state_action_value(s_batch, a_batch)

    # Get actual Q values
    double_actions = self.policy_net(s_next_batch).max(1)[1].detach()  # used for double Q-learning
    q_next = self.state_action_value(s_next_batch, double_actions)
    q_next_actual = (~done_batch) * q_next  # removes elements that are done
    q_target = r_batch + self.gamma * q_next_actual
    ### TEST if clamping works or is even good practice
    q_target = q_target.clamp(-1, 1)
    ### /TEST

    if isinstance(self.memory, WeightedMemory):
        absolute_loss = torch.abs(q - q_target).detach().cpu().numpy()
        loss = weighted_smooth_l1_loss(
            q, q_target, sample_weights
        )  # TODO fix potential non-linearities using huber loss
        self.memory.batch_update(tree_idx, absolute_loss)
    else:
        loss = F.smooth_l1_loss(q, q_target)

    self.optim.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():  # see if this ends up doing anything - should just be relu
        param.grad.data.clamp_(-1, 1)
    self.optim.step()
def smooth_l1_loss(coord, label, loc):
    pos_mask = label > 0
    pos_coord = coord[pos_mask]
    pos_loc = loc[pos_mask]
    loss = F.smooth_l1_loss(pos_loc, pos_coord, reduction='sum')
    return loss / pos_mask.sum()
def _optimize_model(self, model: torch.nn.Module, batch: np.ndarray,
                    gamma: float, device: str):
    """
    Sample batch from memory of environment transitions and train network to
    fit the temporal-difference TD(0) Q-value approximation
    """
    model.train()

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = []
    non_final_idx = []
    non_final_next_states = []
    batch_size = len(batch)
    for idx, state in enumerate(batch.next_state):
        if state is not None:
            non_final_mask.append(True)
            non_final_idx.append(idx)
            non_final_next_states.append(state)
        else:
            non_final_mask.append(False)
    non_final_mask = torch.ByteTensor(non_final_mask)
    # non_final_idx = np.array(non_final_idx)
    non_final_next_states = torch.cat(non_final_next_states)

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = torch.zeros(batch_size).float().to(device)  # zero for terminal states
    # what would the model predict
    next_state_values[non_final_mask] = model(non_final_next_states).max(1)[0]

    with torch.no_grad():
        expected_state_action_values = (
            next_state_values * gamma) + reward_batch  # compute the expected Q values

    loss = F.smooth_l1_loss(
        state_action_values.view(-1),
        expected_state_action_values.view(-1))  # compute Huber loss

    # optimize network
    optimizer.zero_grad()  # optimize towards expected q-values
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def calc_loss(self, coord, label, loc, conf):
    # step2. hard negative mining
    num_class = conf.size(2)
    coord = coord.view(-1, 4)
    loc = loc.view(-1, 4)
    label = label.view(-1)
    conf = conf.view(-1, num_class)

    # "positive" means label is not background
    pos_mask = label != 0
    pos_conf = conf[pos_mask]
    pos_label = label[pos_mask]

    # sort background confidence by loss in descending order
    tmp = F.cross_entropy(conf, label, reduction='none')
    tmp[pos_mask] = 0.
    _, neg_indices = tmp.sort(descending=True)

    # pick num(positive_samples)*3 of negative samples per batch
    num_pos = pos_conf.size(0)
    num_neg = min(num_pos * 3, conf.size(0) - num_pos)
    neg_conf = conf[neg_indices[0:num_neg]]
    neg_label = label[neg_indices[0:num_neg]]

    conf = torch.cat([pos_conf, neg_conf], 0)
    label = torch.cat([pos_label, neg_label], 0)
    l_conf = F.cross_entropy(conf, label, reduction='sum')

    # - calc l_loc
    coord = coord[pos_mask]
    loc = loc[pos_mask]
    l_loc = F.smooth_l1_loss(loc, coord, reduction='sum')

    return (l_conf + self.alpha * l_loc) / num_pos
def calculate_loss(rewards, log_probabilities, values, entropies, config):
    # print(f'rewards: {rewards}, rewards type: {type(rewards)}')
    # print(f'log_probabilities: {log_probabilities}')
    # print(f'values: {values}')
    if len(rewards) <= 0:
        return None

    discounted_rewards = []
    accumulated_rewards = 0
    for current_reward in rewards[::-1]:
        accumulated_rewards = config.reward_discount * accumulated_rewards + current_reward
        discounted_rewards.append(accumulated_rewards)
    # print(f'discounted rewards: {discounted_rewards}')
    discounted_rewards = torch.tensor(discounted_rewards[::-1]).float().to(config.device)

    unbiased = True if len(discounted_rewards) > 1 else False
    # print(f'unbiased: {unbiased}')
    # print(f'std+eps rewards: std: {discounted_rewards.std(unbiased=unbiased)},
    #       {(discounted_rewards.std(unbiased=unbiased) + config.eps)}')
    normalized_rewards = (discounted_rewards - discounted_rewards.mean()) / \
        (discounted_rewards.std(unbiased=unbiased) + config.eps)
    # print(f'normalized_rewards: {normalized_rewards}')

    policy_loss = []
    value_loss = []
    for reward, log_probability, value in zip(normalized_rewards, log_probabilities, values):
        policy_loss.append((reward - value) * -log_probability)
        value = value.squeeze(0).squeeze(0)
        value_loss.append(F.smooth_l1_loss(value, reward))
    # print(f'policy_loss: {policy_loss}')
    # print(f'value_loss: {value_loss}')

    return torch.stack(policy_loss).sum() + 0.5 * torch.stack(value_loss).sum()
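# As a quick sanity check of the reverse-order return accumulation above, a tiny
# standalone example; the rewards and the 0.9 discount below are made up for illustration.
rewards = [1.0, 0.0, 2.0]
gamma = 0.9
acc, discounted = 0.0, []
for r in rewards[::-1]:
    acc = gamma * acc + r
    discounted.append(acc)
discounted = discounted[::-1]
print(discounted)  # ~[2.62, 1.8, 2.0] == [1 + 0.9*(0 + 0.9*2), 0 + 0.9*2, 2]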
def update(self, agent):
    """
    :param agent: The agent ID
    """
    if len(self.replay_buffer) < self.batch_size:
        return

    nbatch = self.replay_buffer.sample(self.batch_size, tensorize=True)
    batch = nbatch[agent]

    non_final_mask = torch.ByteTensor(
        tuple(map(lambda s: not s, batch.done)))
    non_final_next_states = torch.stack(
        [s for done, s in zip(batch.done, batch.next_states) if not done])

    state_batch = batch.states
    action_batch = batch.actions
    reward_batch = batch.rewards

    # select Q(s, a) by masking the Q-values with the action encoding and summing over the action dimension
    state_action_values = ((self.policies[agent](state_batch) *
                            action_batch).sum(dim=1).view(-1, 1))

    # Double DQN target: the online net picks the action, the target net evaluates it
    next_state_values = torch.zeros(self.batch_size)
    best_actions = (self.policies[agent](non_final_next_states).argmax(1).unsqueeze(-1))
    next_state_values[non_final_mask] = (
        self.policy_targets[agent](non_final_next_states).gather(
            dim=1, index=best_actions).squeeze().detach())

    targets = (next_state_values * self.gamma) + reward_batch
    loss = F.smooth_l1_loss(state_action_values, targets.unsqueeze(1))

    self.policy_optimizers[agent].zero_grad()
    loss.backward()
    for param in self.policies[agent].parameters():
        param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
    self.policy_optimizers[agent].step()
def forward(self, predictions, targets):
    """
    Compute the loss function.
    Args:
        predictions: output of the SSD net at training time (tuple)
            loc=torch.Size([num_batch, 8732, 4]),
            conf=torch.Size([num_batch, 8732, 21]),
            dbox_list=torch.Size([8732, 4])
        targets: [num_batch, num_objs, 5]
            the 5 values are the ground-truth annotation [xmin, ymin, xmax, ymax, label_index]
    Returns:
        loss_l: localization (loc) loss, SmoothL1Loss
        loss_c: confidence (conf) loss, CrossEntropyLoss
    """
    loc_data, conf_data, dbox_list = predictions
    # print("loc_data size: ", loc_data.size())

    num_batch = loc_data.size(0)  # mini-batch size
    num_dbox = loc_data.size(1)  # number of DBoxes (8732)
    num_classes = conf_data.size(2)  # number of classes (21)

    # Variables used for the loss computation:
    # conf_t_label: for each DBox, the label of the closest ground-truth BBox (8732)
    # loc_t: for each DBox, the location of the closest ground-truth BBox (8732)
    conf_t_label = torch.LongTensor(num_batch, num_dbox).to(self.device)  # torch.long
    loc_t = torch.Tensor(num_batch, num_dbox, 4).to(self.device)  # Tensor is torch.float32
    # print("loc_t size: ", loc_t.size())
    # print("conf_t_label size: ", conf_t_label.size())

    # Overwrite loc_t and conf_t_label with the result of matching the DBoxes
    # against the ground-truth annotations targets (BBoxes)
    for idx in range(num_batch):
        truths_loc = targets[idx][:, :-1].to(self.device)  # BBox
        labels_conf = targets[idx][:, -1].to(self.device)  # Labels
        # print("truths_loc size: ", truths_loc.size())
        # print("labels_conf size: ", labels_conf)
        dbox = dbox_list.to(self.device)

        # Run the match function, which updates loc_t and conf_t_label in place:
        # loc_t: for each DBox, overwritten with the location of the closest ground-truth BBox
        # conf_t_label: for each DBox, overwritten with the label of the closest ground-truth BBox
        # However, if the Jaccard overlap with the closest BBox is smaller than 0.5,
        # conf_t_label is set to the background class 0.
        variance = [0.1, 0.2]
        # loc_t[idx], conf_t_label[idx] = match(self.jaccard_thresh, truths_loc, dbox, variance, labels_conf)
        match(self.jaccard_thresh, truths_loc, dbox, variance, labels_conf,
              loc_t, conf_t_label, idx)

    # At this point:
    # loc_t holds valid values only for as many of its 8732 entries as there are Positive DBoxes
    # conf_t_label still has 8732 entries; Positive DBoxes hold the class label of the
    # target BBox, Negative DBoxes hold the background class (0)

    # -----
    # Localization loss: loss_l
    # Smooth L1 function,
    # computed only on the offsets of the DBoxes that detected an object
    # -----
    # Mask selecting the DBoxes that detected an object (Positive DBoxes)
    pos_mask = conf_t_label > 0  # torch.Size([num_batch, 8732])
    # torch.Size([num_batch, 8732]) -> torch.Size([num_batch, 8732, 4])
    pos_idx = pos_mask.unsqueeze(pos_mask.dim()).expand_as(loc_data)

    # Take loc_data (predicted offsets) and the supervision loc_t for the Positive DBoxes
    loc_p = loc_data[pos_idx].view(-1, 4)  # boolean indexing always flattens to 1-D, so reshape
    loc_t = loc_t[pos_idx].view(-1, 4)

    # Loss (error) on the offsets loc_t of the Positive DBoxes that detected an object
    loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')
    # print("loc_p", loc_p)
    # print("loc_t", loc_t)
    # print("loss_l", loss_l)

    # -----
    # Class-prediction loss: loss_c
    # Cross-entropy loss.
    # DBoxes whose ground truth is the background class vastly outnumber the rest,
    # so Hard Negative Mining is applied so that the ratio of object DBoxes to
    # background DBoxes becomes 1:3. Among the DBoxes predicted as background,
    # those with a small loss are excluded from the class-prediction loss.
    # -----
    batch_conf = conf_data.view(-1, num_classes)  # (batch_num, 8732, 21) -> (batch_num*8732, 21)
    # print("batch_conf", batch_conf)
    # print("batch_conf size: ", batch_conf.size())

    # Compute the class-prediction loss (reduction='none' keeps the per-element losses)
    # batch_conf size: (batch_num*8732, 21), conf_t_label size: (batch_num*8732,)
    loss_c = F.cross_entropy(batch_conf, conf_t_label.view(-1),
                             reduction='none')  # for now, the loss over every DBox
    # loss_c: (batch_num * 8732,)

    # -----
    # Build the mask selecting which Negative DBoxes are kept by Hard Negative Mining
    # -----
    # Zero out the loss of the Positive DBoxes that detected an object
    # (note: objects have label >= 1; 0 is the background)
    num_pos = pos_mask.long().sum(
        dim=1, keepdim=True
    )  # number of Positive boxes per input image: (batch_num, 8732) -> (batch_num, 1)
    loss_c = loss_c.view(num_batch, -1)  # torch.Size([num_batch, 8732])
    loss_c[pos_mask] = 0  # set the loss of DBoxes that detected an object to 0

    # Run Hard Negative Mining
    """Compute idx_rank, the rank of each DBox when ordered by its loss loss_c"""
    _, loss_idx = loss_c.sort(dim=1, descending=True)  # sort the DBoxes (8732) by loss, descending
    _, idx_rank = loss_idx.sort(dim=1)  # rank of each original DBox in that descending order
    """
    (Note)
    The two lines above are unusual and not intuitive.
    The goal is to obtain, for each DBox, its rank by loss magnitude (idx_rank) efficiently.
    Sorting the DBox losses in descending order stores the descending-order indices in loss_idx;
    we then want idx_rank, the rank of each DBox by the magnitude of its loss loss_c.
    To rearrange the descending index array loss_idx back into ascending order 0..8732,
    idx_rank tells us which position of loss_idx to take.
    For example, to find element 0 of idx_rank, idx_rank[0], we ask at which position of
    loss_idx the value 0 appears, i.e. loss_idx[?] = 0, where ? = idx_rank[0].
    The value 0 in loss_idx[?] = 0 refers to element 0 of the original loss_c.
    So ? answers "at which position of the descending order does element 0 of loss_c appear",
    and therefore ? = idx_rank[0] is the descending-order rank of loss_c element 0.
    e.g.
      loss_c                      3.2 5.8 1.3 2.5 4.0
      sorted_loss_c               5.8 4.0 3.2 2.5 1.3
      descending_of_loss_c_index  1   4   0   3   2   (loss_idx)
      sorted_loss_idx             0   1   2   3   4
      ascending_of_loss_idx       2   0   4   3   1   (idx_rank)
    """

    # Decide the number of background DBoxes num_neg. With Hard Negative Mining it is
    # self.negpos_ratio (3) times num_pos, the number of DBoxes that detected an object.
    # Should that ever exceed the total number of DBoxes, cap it at the number of DBoxes.
    num_neg = torch.clamp(num_pos * self.negpos_ratio, max=num_dbox)

    # Mask selecting the DBoxes whose rank is within num_neg, i.e. whose loss is among the largest
    neg_mask = idx_rank < num_neg.expand_as(idx_rank)

    # -----
    # (end)
    # -----

    # Masks selecting the Negative DBoxes kept by Hard Negative Mining
    # pos_mask: torch.Size([num_batch, 8732]) -> pos_idx_mask: torch.Size([num_batch, 8732, 21])
    pos_idx_mask = pos_mask.unsqueeze(2).expand_as(conf_data)
    neg_idx_mask = neg_mask.unsqueeze(2).expand_as(conf_data)

    # Extract only pos and neg into conf_hnm. torch.Size([num_pos + num_neg, 21])
    # gt is short for "greater than" (>); it picks the indices where the mask is 1.
    conf_hnm = conf_data[(pos_idx_mask + neg_idx_mask).gt(0)].view(-1, num_classes)

    # conf_t_label for pos and neg only. torch.Size([pos + neg])
    conf_t_label_hnm = conf_t_label[(pos_mask + neg_mask).gt(0)]

    # Compute the confidence loss
    loss_c = F.cross_entropy(conf_hnm, conf_t_label_hnm, reduction='sum')
    # print("conf_hnm", conf_hnm)
    # print("conf_t_label_hnm", conf_t_label_hnm)
    # print("loss_c", loss_c)

    # Divide the losses by N, the number of BBoxes that detected an object (summed over the mini-batch)
    N = num_pos.sum()
    loss_l /= N
    loss_c /= N

    return loss_l, loss_c
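# The double-sort trick explained in the note above can be checked in isolation.
# A minimal standalone sketch using the same toy values; this tensor is illustrative
# and not part of the original model.
import torch

toy_loss_c = torch.tensor([[3.2, 5.8, 1.3, 2.5, 4.0]])
_, toy_loss_idx = toy_loss_c.sort(dim=1, descending=True)  # DBox indices, largest loss first
_, toy_idx_rank = toy_loss_idx.sort(dim=1)                 # rank (0 = largest loss) of each original element
print(toy_loss_idx)  # tensor([[1, 4, 0, 3, 2]])
print(toy_idx_rank)  # tensor([[2, 0, 4, 3, 1]])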
def train_agent(self, memory_buffer):
    states, actions, log_probs_old, returns, advantages = memory_buffer.cat(
        ['s', 'a', 'log_pi_a', 'ret', 'adv'])
    actions = actions.detach()
    log_probs_old = log_probs_old.detach()
    advantages = (advantages - advantages.mean()) / advantages.std()

    sum_returns = 0
    sum_advantage = 0
    sum_policy_loss = 0
    sum_critic_loss = 0
    sum_entropy = 0
    batch_steps = 0

    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.opt, T_max=config.optimization_epochs, eta_min=self.min_lr)
    self.network.train()
    config = self.config
    for ep in range(config.optimization_epochs):
        sampler = random_sample(np.arange(states.size(0)), config.mini_batch_size)
        for batch_indices in sampler:
            batch_indices = tensor(batch_indices).long()
            sampled_states = states[batch_indices]
            sampled_actions = actions[batch_indices]
            sampled_log_probs_old = log_probs_old[batch_indices]
            sampled_returns = returns[batch_indices]
            sampled_advantages = advantages[batch_indices]

            # this activates only the part of the network responsible for V and log_policy;
            # actions in this case are already provided and won't be calculated!
            prediction = self.network(sampled_states.cuda(), sampled_actions.cuda())

            # ratio between the newly calculated policy and the old policy
            ratio = (prediction['log_pi_a'] - sampled_log_probs_old).exp()
            obj = ratio * sampled_advantages
            # surrogate clipping: the ratio is clamped to [1 - epsilon, 1 + epsilon] here
            obj_clipped = ratio.clamp(
                1.0 - self.config.ppo_ratio_clip,
                1.0 + self.config.ppo_ratio_clip) * sampled_advantages

            # entropy_weight is a factor for the entropy boost - it should be set to 0 once training stabilises
            policy_loss = torch.min(obj, obj_clipped).mean(
            ) + config.entropy_weight * prediction['ent'].mean()

            # Huber loss
            value_loss = F.smooth_l1_loss(prediction['v'],
                                          sampled_returns.view(-1, 1))

            sum_returns, sum_advantage, sum_policy_loss, sum_critic_loss, sum_entropy = \
                self.log_stats(sampled_returns, sampled_advantages, policy_loss,
                               value_loss, prediction['ent'].mean(), batch_steps,
                               sum_returns, sum_advantage, sum_critic_loss,
                               sum_policy_loss, sum_entropy)
            batch_steps += 1

            self.opt.zero_grad()
            # maximize the surrogate (plus entropy bonus), minimize the value loss
            (-(policy_loss - value_loss)).backward()
            nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
            self.opt.step()
    # lr_scheduler.step()
    return batch_steps
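# A quick numeric illustration of the clipped surrogate used above. The ratios,
# advantages, and the 0.2 epsilon are made up for illustration; the real clip value
# comes from config.ppo_ratio_clip.
import torch

ratio = torch.tensor([0.5, 1.0, 1.5])       # new_pi / old_pi per sample
advantage = torch.tensor([1.0, 1.0, -1.0])
eps = 0.2
obj = ratio * advantage
obj_clipped = ratio.clamp(1 - eps, 1 + eps) * advantage
print(torch.min(obj, obj_clipped))  # tensor([ 0.5000,  1.0000, -1.5000])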
def smooth_l1_loss(coord, loc):
    loss = F.smooth_l1_loss(loc, coord, reduction='sum')
    return loss