def forward(self, vocab):
    with torch.no_grad():
        batch_shape = vocab['sentence'].shape
        s_embedding = self.embedding(vocab['sentence'].cuda())
        a_embedding = self.embedding(vocab['aspect'].cuda())
    packed_s = pack_padded_sequence(s_embedding, vocab['sent_len'], batch_first=True)
    out_s, (h_s, c1) = self.lstm_s(packed_s)  # packed output
    out_a, (h_a, c2) = self.lstm_a(a_embedding)
    with torch.no_grad():
        unpacked_out_s, _ = pad_packed_sequence(out_s, batch_first=True)
    # Pair-wise interaction matrix
    I_matrix = torch.bmm(unpacked_out_s, out_a.permute(0, 2, 1))
    # Column-wise softmax
    a2s_attn = F.softmax(I_matrix, dim=1)
    # Row-wise softmax => column-wise average => aspect attention
    s2a_attn = F.softmax(I_matrix, dim=2)
    a_attn = torch.mean(s2a_attn, dim=1)
    # Final sentence attention: weighted sum of each individual a2s_attn
    s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))
    final_rep = torch.bmm(unpacked_out_s.permute(0, 2, 1), s_attn).squeeze(-1)
    pred = self.fc(final_rep)
    return pred
def softmax(tensor):
    r"""Wrapper around softmax to make it work with both Tensors and Variables.

    TODO: Remove once https://github.com/pytorch/pytorch/issues/2633 is resolved.
    """
    if not isinstance(tensor, Variable):
        return F.softmax(Variable(tensor), -1).data
    return F.softmax(tensor, -1)
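# A minimal usage sketch (not from the original repo) of the wrapper above:
# it normalizes over the last dimension whether it receives a raw Tensor or
# an old-style Variable, so both call sites get probabilities back.
import torch

_t = torch.randn(2, 5)
_p = softmax(_t)  # raw-Tensor path; each row sums to ~1.0
assert torch.allclose(_p.sum(-1), torch.ones(2), atol=1e-5)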
def train(model, trainLoader, criterion, optimizer, evalData=None, epoch=1,
          echoStep=100, evalStep=1000, saveStep=5000, savePath="./"):
    if evalData is not None:
        evalX, evalY = evalData
        if torch.cuda.is_available():
            evalY = evalY.cuda()
            if isinstance(evalX, list):
                for ti, t in enumerate(evalX):
                    evalX[ti] = evalX[ti].cuda()
            else:
                evalX = evalX.cuda()
    batchLen = len(trainLoader)
    for epochIdx in range(epoch):
        for i, batch in enumerate(trainLoader, batchLen * epochIdx + 1):
            x, y = batch
            if torch.cuda.is_available():
                y = y.cuda()
                if isinstance(x, list):
                    for ti, t in enumerate(x):
                        x[ti] = x[ti].cuda()
                else:
                    x = x.cuda()
            out = model(x)
            loss = criterion(out, y)
            prob = F.softmax(out, 1)
            pred = torch.argmax(out, dim=1)
            correct = pred.eq(y).sum()
            acc = float(correct) / len(y)
            # print loss
            if i % echoStep == 0:
                print("Step %d/%d/%d : Loss %.4f , Acc %.4f " % (i, batchLen * epoch, epochIdx + 1, float(loss), acc))
            # evaluate
            if i % evalStep == 0 and evalData is not None:
                evalOut = model(evalX)
                evalLoss = criterion(evalOut, evalY)
                correct = torch.argmax(F.softmax(evalOut, 1), dim=1).eq(evalY).sum()
                evalAcc = float(correct) / len(evalY)
                print("------------------------------------------------")
                print("Evaluate %d Sample : Loss %.4f , Acc %.4f " % (evalY.size(0), float(evalLoss), evalAcc))
                print()
            # save model
            if i % saveStep == 0:
                outFile = "%s/m_%d_%d.pt" % (savePath, i, epochIdx + 1)
                torch.save(model.state_dict(), outFile)
                print("Save model : %s" % outFile)
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    outFile = "%s/final.pt" % savePath
    torch.save(model.state_dict(), outFile)
    print("Save model : %s" % outFile)
def validate(eval_loader, model, log, global_step, epoch):
    class_criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=NO_LABEL).cuda()
    meters = AverageMeterSet()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(eval_loader):
        meters.update('data_time', time.time() - end)

        with torch.no_grad():
            input_var = input
            target_var = target.cuda(non_blocking=True)

            minibatch_size = len(target_var)
            labeled_minibatch_size = target_var.ne(NO_LABEL).sum()
            assert labeled_minibatch_size > 0
            meters.update('labeled_minibatch_size', labeled_minibatch_size)

            # compute output
            output1, output2 = model(input_var)
            softmax1, softmax2 = F.softmax(output1, dim=1), F.softmax(output2, dim=1)
            class_loss = class_criterion(output1, target_var) / minibatch_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output1.data, target_var.data, topk=(1, 5))
        meters.update('class_loss', class_loss.item(), labeled_minibatch_size)
        meters.update('top1', prec1[0], labeled_minibatch_size)
        meters.update('error1', 100.0 - prec1[0], labeled_minibatch_size)
        meters.update('top5', prec5[0], labeled_minibatch_size)
        meters.update('error5', 100.0 - prec5[0], labeled_minibatch_size)

        # measure elapsed time
        meters.update('batch_time', time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            LOG.info(
                'Test: [{0}/{1}]\t'
                'Time {meters[batch_time]:.3f}\t'
                'Data {meters[data_time]:.3f}\t'
                'Class {meters[class_loss]:.4f}\t'
                'Prec@1 {meters[top1]:.3f}\t'
                'Prec@5 {meters[top5]:.3f}'.format(
                    i, len(eval_loader), meters=meters))

    LOG.info(' * Prec@1 {top1.avg:.3f}\tPrec@5 {top5.avg:.3f}'
             .format(top1=meters['top1'], top5=meters['top5']))
    log.record(epoch, {
        'step': global_step,
        **meters.values(),
        **meters.averages(),
        **meters.sums()
    })
    return meters['top1'].avg
def forward(self, x):
    x = F.relu(self.lin1(x))
    out = self.head(x)
    splits = out.view(x.size()[0], 2, 9).chunk(2, 1)
    # return torch.stack(list(map(lambda s: F.softmax(s[0], dim=-1), splits)), 0)
    print(torch.sum(F.softmax(splits[0], dim=-1).view(x.size()[0], 9), dim=1))  # sanity check: rows sum to 1
    return F.softmax(splits[0], dim=-1), F.softmax(splits[1], dim=-1)
def softmax_mse_loss(input_logits, target_logits):
    """Takes softmax on both sides and returns MSE loss

    Note:
    - Returns the sum over all examples. Divide by the batch size afterwards
      if you want the mean.
    - Sends gradients to inputs but not the targets.
    """
    assert input_logits.size() == target_logits.size()
    input_softmax = F.softmax(input_logits, dim=1)
    target_softmax = F.softmax(target_logits, dim=1)
    num_classes = input_logits.size()[1]
    return F.mse_loss(input_softmax, target_softmax, reduction='sum') / num_classes
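# Hedged usage sketch for softmax_mse_loss above: a Mean-Teacher style
# consistency term between student and teacher logits. The variable names
# (student_logits, teacher_logits) are illustrative; the teacher side is
# detached, matching the note that gradients flow to the inputs only.
import torch

student_logits = torch.randn(8, 10, requires_grad=True)
teacher_logits = torch.randn(8, 10)
consistency = softmax_mse_loss(student_logits, teacher_logits.detach()) / student_logits.size(0)
consistency.backward()  # gradients reach student_logits only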
def _region_proposal(self, net_conv_level1, net_conv_level2, net_conv_level3):
    if cfg.NUM_ANCHORS_LEVEL1 != 0:
        rpn_level1 = F.relu(self.rpn_net_level1(net_conv_level1))
        # batch x w x h x l x (num_anchors x 6)
        rpn_bbox_pred_level1 = self.rpn_bbox_pred_net_level1(rpn_level1).permute(0, 2, 3, 4, 1).contiguous()
        # batch x 2 x w x h x l x num_anchors
        rpn_cls_score_level1 = self.rpn_cls_score_net_level1(rpn_level1).view(
            self.batch_size, 2, cfg.NUM_ANCHORS_LEVEL1,
            rpn_bbox_pred_level1.size(1), rpn_bbox_pred_level1.size(2),
            rpn_bbox_pred_level1.size(3)).permute(0, 1, 3, 4, 5, 2).contiguous()
        rpn_cls_prob_level1 = F.softmax(rpn_cls_score_level1, dim=1)

        self._predictions["rpn_cls_score_level1"] = rpn_cls_score_level1
        self._predictions["rpn_cls_prob_level1"] = rpn_cls_prob_level1
        self._predictions["rpn_bbox_pred_level1"] = rpn_bbox_pred_level1

    if cfg.NUM_ANCHORS_LEVEL2 != 0:
        rpn_level2 = F.relu(self.rpn_net_level2(net_conv_level2))
        # batch x w x h x l x (num_anchors x 6)
        rpn_bbox_pred_level2 = self.rpn_bbox_pred_net_level2(rpn_level2).permute(0, 2, 3, 4, 1).contiguous()
        # batch x 2 x w x h x l x num_anchors
        rpn_cls_score_level2 = self.rpn_cls_score_net_level2(rpn_level2).view(
            self.batch_size, 2, cfg.NUM_ANCHORS_LEVEL2,
            rpn_bbox_pred_level2.size(1), rpn_bbox_pred_level2.size(2),
            rpn_bbox_pred_level2.size(3)).permute(0, 1, 3, 4, 5, 2).contiguous()
        rpn_cls_prob_level2 = F.softmax(rpn_cls_score_level2, dim=1)

        self._predictions["rpn_cls_score_level2"] = rpn_cls_score_level2
        self._predictions["rpn_cls_prob_level2"] = rpn_cls_prob_level2
        self._predictions["rpn_bbox_pred_level2"] = rpn_bbox_pred_level2

    if cfg.NUM_ANCHORS_LEVEL3 != 0:
        rpn_level3 = F.relu(self.rpn_net_level3(net_conv_level3))
        # batch x w x h x l x (num_anchors x 6)
        rpn_bbox_pred_level3 = self.rpn_bbox_pred_net_level3(rpn_level3).permute(0, 2, 3, 4, 1).contiguous()
        # batch x 2 x w x h x l x num_anchors
        rpn_cls_score_level3 = self.rpn_cls_score_net_level3(rpn_level3).view(
            self.batch_size, 2, cfg.NUM_ANCHORS_LEVEL3,
            rpn_bbox_pred_level3.size(1), rpn_bbox_pred_level3.size(2),
            rpn_bbox_pred_level3.size(3)).permute(0, 1, 3, 4, 5, 2).contiguous()
        rpn_cls_prob_level3 = F.softmax(rpn_cls_score_level3, dim=1)

        self._predictions["rpn_cls_score_level3"] = rpn_cls_score_level3
        self._predictions["rpn_cls_prob_level3"] = rpn_cls_prob_level3
        self._predictions["rpn_bbox_pred_level3"] = rpn_bbox_pred_level3

    if self._mode == 'TRAIN':
        self._anchor_target_layer(
            [*rpn_cls_score_level1.shape[2:5]] if cfg.NUM_ANCHORS_LEVEL1 != 0 else None,
            [*rpn_cls_score_level2.shape[2:5]] if cfg.NUM_ANCHORS_LEVEL2 != 0 else None,
            [*rpn_cls_score_level3.shape[2:5]] if cfg.NUM_ANCHORS_LEVEL3 != 0 else None)

    self._proposal_layer(rpn_cls_prob_level1 if cfg.NUM_ANCHORS_LEVEL1 != 0 else None,
                         rpn_bbox_pred_level1 if cfg.NUM_ANCHORS_LEVEL1 != 0 else None,
                         rpn_cls_prob_level2 if cfg.NUM_ANCHORS_LEVEL2 != 0 else None,
                         rpn_bbox_pred_level2 if cfg.NUM_ANCHORS_LEVEL2 != 0 else None,
                         rpn_cls_prob_level3 if cfg.NUM_ANCHORS_LEVEL3 != 0 else None,
                         rpn_bbox_pred_level3 if cfg.NUM_ANCHORS_LEVEL3 != 0 else None)
def forward(self, x):
    # x: seq_len * batch_size * num
    if not self.training:
        seq_len = x.size()[0]
        return torch.stack([F.softmax(x[i], dim=1) for i in range(seq_len)], 0)
    else:
        return x
def tree_backup(self, tree_result, batch_size):
    backup_values = tree_result["values"][-1]
    for i in range(1, self.tree_depth + 1):
        one_step_backup = tree_result["rewards"][-i] + self.gamma * backup_values
        if i < self.tree_depth:
            one_step_backup = one_step_backup.view(batch_size, -1, self.num_actions)
            if self.value_aggregation == "max":
                max_backup = one_step_backup.max(2)[0]
            elif self.value_aggregation == "logsumexp":
                max_backup = logsumexp(one_step_backup, 2)
            elif self.value_aggregation == "softmax":
                max_backup = (one_step_backup * F.softmax(one_step_backup, dim=2)).sum(dim=2)
            else:
                raise ValueError("Unknown value aggregation function %s" % self.value_aggregation)
            backup_values = ((1 - self.td_lambda) * tree_result["values"][-i - 1]
                             + self.td_lambda * max_backup.view(-1, 1))
        else:
            backup_values = one_step_backup
    backup_values = backup_values.view(batch_size, self.num_actions)
    return backup_values
def forward(self, hidden, encoder_outputs, encoder_lengths=None, return_weight=False):
    """
    hidden : query (previous hidden) B,1,D <FloatTensor>
    encoder_outputs : context (encoder outputs) B,T,D <FloatTensor>
    encoder_lengths : list[int]
    """
    q, c = hidden, encoder_outputs
    batch_size_q, n_q, dim_q = q.size()
    batch_size_c, n_c, dim_c = c.size()
    if not (batch_size_q == batch_size_c):
        msg = 'batch size mismatch (query: {}, context: {})'
        raise ValueError(msg.format(q.size(), c.size()))
    batch_size = batch_size_q
    s = self.score(q, c)

    # mask the padded encoder timesteps
    if encoder_lengths is not None:
        mask = s.data.new(batch_size, n_q, n_c)
        mask = self.fill_context_mask(mask, sizes=encoder_lengths,
                                      v_mask=float('-inf'), v_unmask=0)
        s = Variable(mask) + s

    # normalize with softmax
    w = F.softmax(s, 2)  # B,1,T

    # combine
    z = w.bmm(c)
    if return_weight:
        return w, z
    return z
def probs(self, generator, outputs, vocab_pointer_switches, context_question_switches,
          context_attention, question_attention, context_indices, question_indices,
          oov_to_limited_idx):
    size = list(outputs.size())
    size[-1] = self.generative_vocab_size
    scores = generator(outputs.view(-1, outputs.size(-1))).view(size)
    p_vocab = F.softmax(scores, dim=scores.dim() - 1)
    scaled_p_vocab = vocab_pointer_switches.expand_as(p_vocab) * p_vocab

    effective_vocab_size = self.generative_vocab_size + len(oov_to_limited_idx)
    if self.generative_vocab_size < effective_vocab_size:
        size[-1] = effective_vocab_size - self.generative_vocab_size
        buff = scaled_p_vocab.new_full(size, EPSILON)
        scaled_p_vocab = torch.cat([scaled_p_vocab, buff], dim=buff.dim() - 1)

    # p_context_ptr
    scaled_p_vocab.scatter_add_(scaled_p_vocab.dim() - 1,
                                context_indices.unsqueeze(1).expand_as(context_attention),
                                (context_question_switches * (1 - vocab_pointer_switches)).expand_as(context_attention) * context_attention)
    # p_question_ptr
    scaled_p_vocab.scatter_add_(scaled_p_vocab.dim() - 1,
                                question_indices.unsqueeze(1).expand_as(question_attention),
                                ((1 - context_question_switches) * (1 - vocab_pointer_switches)).expand_as(question_attention) * question_attention)
    return scaled_p_vocab
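# Minimal illustrative sketch (not from the source) of the scatter_add_ copy
# step used above: attention mass is accumulated onto a vocabulary
# distribution at the token ids present in the context, so repeated ids sum.
import torch

vocab_dist = torch.zeros(1, 6)
context_indices = torch.tensor([[2, 4, 2]])          # token id at each source position
context_attention = torch.tensor([[0.5, 0.3, 0.2]])  # attention over source positions
vocab_dist.scatter_add_(1, context_indices, context_attention)  # id 2 receives 0.5 + 0.2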
def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
    # The p_att_feats here is already projected
    att_size = att_feats.numel() // att_feats.size(0) // self.att_feat_size
    att = p_att_feats.view(-1, att_size, self.att_hid_size)

    att_h = self.h2att(state[0][-1])           # batch * att_hid_size
    att_h = att_h.unsqueeze(1).expand_as(att)  # batch * att_size * att_hid_size
    dot = att + att_h                          # batch * att_size * att_hid_size
    dot = torch.tanh(dot)                      # batch * att_size * att_hid_size
    dot = dot.view(-1, self.att_hid_size)      # (batch * att_size) * att_hid_size
    dot = self.alpha_net(dot)                  # (batch * att_size) * 1
    dot = dot.view(-1, att_size)               # batch * att_size

    weight = F.softmax(dot, dim=1)             # batch * att_size
    att_feats_ = att_feats.view(-1, att_size, self.att_feat_size)    # batch * att_size * att_feat_size
    att_res = torch.bmm(weight.unsqueeze(1), att_feats_).squeeze(1)  # batch * att_feat_size

    all_input_sums = self.i2h(xt) + self.h2h(state[0][-1])
    sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size)
    sigmoid_chunk = torch.sigmoid(sigmoid_chunk)
    in_gate = sigmoid_chunk.narrow(1, 0, self.rnn_size)
    forget_gate = sigmoid_chunk.narrow(1, self.rnn_size, self.rnn_size)
    out_gate = sigmoid_chunk.narrow(1, self.rnn_size * 2, self.rnn_size)

    in_transform = all_input_sums.narrow(1, 3 * self.rnn_size, 2 * self.rnn_size) + \
        self.a2c(att_res)
    in_transform = torch.max(
        in_transform.narrow(1, 0, self.rnn_size),
        in_transform.narrow(1, self.rnn_size, self.rnn_size))
    next_c = forget_gate * state[1][-1] + in_gate * in_transform
    next_h = out_gate * torch.tanh(next_c)

    output = self.dropout(next_h)
    state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
    return output, state
def iterate_batches(envs, net, device="cpu"):
    n_actions = envs[0].action_space.n
    act_selector = ptan.actions.ProbabilityActionSelector()
    obs = [e.reset() for e in envs]
    batch_dones = [[False] for _ in range(NUM_ENVS)]
    total_reward = [0.0] * NUM_ENVS
    total_steps = [0] * NUM_ENVS
    mb_obs = np.zeros((NUM_ENVS, REWARD_STEPS) + IMG_SHAPE, dtype=np.uint8)
    mb_rewards = np.zeros((NUM_ENVS, REWARD_STEPS), dtype=np.float32)
    mb_values = np.zeros((NUM_ENVS, REWARD_STEPS), dtype=np.float32)
    mb_actions = np.zeros((NUM_ENVS, REWARD_STEPS), dtype=np.int32)
    mb_probs = np.zeros((NUM_ENVS, REWARD_STEPS, n_actions), dtype=np.float32)

    while True:
        batch_dones = [[dones[-1]] for dones in batch_dones]
        done_rewards = []
        done_steps = []
        for n in range(REWARD_STEPS):
            obs_v = ptan.agent.default_states_preprocessor(obs).to(device)
            mb_obs[:, n] = obs_v.data.cpu().numpy()
            logits_v, values_v = net(obs_v)
            probs_v = F.softmax(logits_v, dim=1)
            probs = probs_v.data.cpu().numpy()
            actions = act_selector(probs)
            mb_probs[:, n] = probs
            mb_actions[:, n] = actions
            mb_values[:, n] = values_v.squeeze().data.cpu().numpy()
            for e_idx, e in enumerate(envs):
                o, r, done, _ = e.step(actions[e_idx])
                total_reward[e_idx] += r
                total_steps[e_idx] += 1
                if done:
                    o = e.reset()
                    done_rewards.append(total_reward[e_idx])
                    done_steps.append(total_steps[e_idx])
                    total_reward[e_idx] = 0.0
                    total_steps[e_idx] = 0
                obs[e_idx] = o
                mb_rewards[e_idx, n] = r
                batch_dones[e_idx].append(done)

        # obtain values for the last observation
        obs_v = ptan.agent.default_states_preprocessor(obs).to(device)
        _, values_v = net(obs_v)
        values_last = values_v.squeeze().data.cpu().numpy()

        for e_idx, (rewards, dones, value) in enumerate(zip(mb_rewards, batch_dones, values_last)):
            rewards = rewards.tolist()
            if not dones[-1]:
                rewards = discount_with_dones(rewards + [value], dones[1:] + [False], GAMMA)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones[1:], GAMMA)
            mb_rewards[e_idx] = rewards

        out_mb_obs = mb_obs.reshape((-1,) + IMG_SHAPE)
        out_mb_rewards = mb_rewards.flatten()
        out_mb_actions = mb_actions.flatten()
        out_mb_values = mb_values.flatten()
        out_mb_probs = mb_probs.flatten()
        yield out_mb_obs, out_mb_rewards, out_mb_actions, out_mb_values, out_mb_probs, \
            np.array(done_rewards), np.array(done_steps)
def forward(self, x, y, y_mask):
    """Input shapes:
        x = batch * len1 * h
        y = batch * len2 * h
        y_mask = batch * len2
    Output shapes:
        matched_seq = batch * len1 * h
    """
    # Project vectors
    if self.linear:
        x_proj = self.linear(x.view(-1, x.size(2))).view(x.size())
        x_proj = F.relu(x_proj)
        y_proj = self.linear(y.view(-1, y.size(2))).view(y.size())
        y_proj = F.relu(y_proj)
    else:
        x_proj = x
        y_proj = y

    # Compute scores
    scores = x_proj.bmm(y_proj.transpose(2, 1))

    # Mask padding
    y_mask = y_mask.unsqueeze(1).expand(scores.size())
    scores.data.masked_fill_(y_mask.data, -float('inf'))

    # Normalize with softmax
    alpha_flat = F.softmax(scores.view(-1, y.size(1)), dim=-1)
    alpha = alpha_flat.view(-1, x.size(1), y.size(1))

    # Take weighted average
    matched_seq = alpha.bmm(y)
    return matched_seq
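# Illustrative sketch (not from the source) of the masking idiom above:
# padded positions are filled with -inf before the softmax, so they get
# exactly zero attention weight after normalization.
import torch
import torch.nn.functional as F

scores = torch.randn(2, 4)
pad_mask = torch.tensor([[False, False, True, True],
                         [False, False, False, True]])
scores.masked_fill_(pad_mask, -float('inf'))
alpha = F.softmax(scores, dim=-1)  # each row sums to 1 over the unpadded positions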
def probs(self, generator, outputs, vocab_pointer_switches, context_question_switches,
          context_attention, question_attention, context_indices, question_indices,
          oov_to_limited_idx):
    size = list(outputs.size())
    size[-1] = self.generative_vocab_size
    scores = generator(outputs.view(-1, outputs.size(-1))).view(size)
    p_vocab = F.softmax(scores, dim=scores.dim() - 1)
    scaled_p_vocab = vocab_pointer_switches.expand_as(p_vocab) * p_vocab

    effective_vocab_size = self.generative_vocab_size + len(oov_to_limited_idx)
    if self.generative_vocab_size < effective_vocab_size:
        size[-1] = effective_vocab_size - self.generative_vocab_size
        buff = Variable(scaled_p_vocab.data.new(*size).fill_(EPSILON))
        scaled_p_vocab = torch.cat([scaled_p_vocab, buff], dim=buff.dim() - 1)

    p_context_ptr = Variable(scaled_p_vocab.data.new(*scaled_p_vocab.size()).fill_(EPSILON))
    p_context_ptr.scatter_add_(p_context_ptr.dim() - 1,
                               context_indices.unsqueeze(1).expand_as(context_attention),
                               context_attention)
    scaled_p_context_ptr = (context_question_switches * (1 - vocab_pointer_switches)).expand_as(p_context_ptr) * p_context_ptr

    p_question_ptr = Variable(scaled_p_vocab.data.new(*scaled_p_vocab.size()).fill_(EPSILON))
    p_question_ptr.scatter_add_(p_question_ptr.dim() - 1,
                                question_indices.unsqueeze(1).expand_as(question_attention),
                                question_attention)
    scaled_p_question_ptr = ((1 - context_question_switches) * (1 - vocab_pointer_switches)).expand_as(p_question_ptr) * p_question_ptr

    probs = scaled_p_vocab + scaled_p_context_ptr + scaled_p_question_ptr
    return probs
def test(model, device, test_loader):
    model.to(device)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        y_pred = []
        y_true = []
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            output = torch.mean(output.view(output.size(0), output.size(1), -1), dim=2)
            # accumulate the per-sample sum so the printed "average loss" is a true average
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            output = F.softmax(output, dim=1)
            confidence, pred = output.max(1)
            print('confidence: {}, prediction: {}, ground truth: {}'.format(
                confidence.cpu().numpy(), pred.cpu().numpy(), target.cpu().numpy()))
            y_pred += pred.data.tolist()
            y_true += target.data.tolist()
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    print(metrics.classification_report(np.asarray(y_true), np.asarray(y_pred)))
    print('confusion matrix: \n', metrics.confusion_matrix(np.asarray(y_true), np.asarray(y_pred)))
    print('\n')
def train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, tb_tracker, step_idx, device="cpu"):
    optimizer.zero_grad()
    mb_adv = mb_rewards - mb_values
    adv_v = torch.FloatTensor(mb_adv).to(device)
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_t = torch.LongTensor(mb_actions).to(device)
    logits_v, values_v = net(obs_v)
    log_prob_v = F.log_softmax(logits_v, dim=1)
    log_prob_actions_v = adv_v * log_prob_v[range(len(mb_actions)), actions_t]
    loss_policy_v = -log_prob_actions_v.mean()
    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)

    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()
    loss_v = ENTROPY_BETA * entropy_loss_v + VALUE_LOSS_COEF * loss_value_v + loss_policy_v
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    tb_tracker.track("advantage", mb_adv, step_idx)
    tb_tracker.track("values", values_v, step_idx)
    tb_tracker.track("batch_rewards", rewards_v, step_idx)
    tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
    tb_tracker.track("loss_policy", loss_policy_v, step_idx)
    tb_tracker.track("loss_value", loss_value_v, step_idx)
    tb_tracker.track("loss_total", loss_v, step_idx)
    return obs_v
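# Sign-convention sketch (illustrative, not from the source): the term
# (prob * log_prob).sum(dim=1) above is the negative entropy, so adding
# ENTROPY_BETA * entropy_loss_v to the loss rewards higher-entropy policies.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.0, 0.0]])
p, logp = F.softmax(logits, dim=1), F.log_softmax(logits, dim=1)
neg_entropy = (p * logp).sum(dim=1)  # negative here; 0 only for a deterministic policy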
def forward(self, x):
    x = self.features(x)
    a = self.conv6_1(x)
    b = self.conv6_2(x)
    c = self.conv6_3(x)
    a = F.softmax(a, dim=1)
    return c, b, a
def routing(self, x, b_IJ, W, batch_size, routing_iter):
    x1 = x.view(batch_size, 256, 1, 6, 6)
    x_tile = x1.repeat(1, 1, 10, 1, 1)
    x_view = x_tile.view(batch_size, 1152, 10, 8, 1)

    stride_i = W.repeat(batch_size, 1, 1, 1, 1)
    stride_j = stride_i.view(batch_size, 1152, 10, 16, 8)
    dot_op = torch.matmul(stride_j, x_view)
    dot_op_stopped = Variable(dot_op.data.clone(), requires_grad=False)

    for r_iter in range(routing_iter):
        id_capsule = F.softmax(b_IJ, dim=2)
        if r_iter == routing_iter - 1:
            route_I = torch.mul(id_capsule, dot_op)
            route_I_sum = torch.sum(route_I, dim=1, keepdim=True) + self.bias
            V_J = squash(route_I_sum, self.epsilon)
        if r_iter < routing_iter - 1:
            dot_op_stopped_tmp = dot_op_stopped.data.numpy()
            dot_op_stopped_tmp = np.reshape(dot_op_stopped_tmp, (batch_size, 1152, 10, 16, 1))
            id_capsule_tmp = id_capsule.data.numpy()
            route_I_tmp = id_capsule_tmp * dot_op_stopped_tmp
            route_I_tmp_sum = np.sum(route_I_tmp, axis=1, keepdims=True) + self.bias.data.numpy()
            V_J_tmp = squash(torch.Tensor(route_I_tmp_sum), self.epsilon)
            V_J_tmp_tiled = np.tile(V_J_tmp.numpy(), (1, 1152, 1, 1, 1))
            dot_op_stopped_tmp = np.reshape(dot_op_stopped_tmp, (batch_size, 1152, 10, 1, 16))
            u_produce_v = np.matmul(dot_op_stopped_tmp, V_J_tmp_tiled)
            b_IJ.data += torch.Tensor(u_produce_v)
    return V_J
def loss(anchors, data, pred, threshold):
    iou = pred['iou']
    device_id = iou.get_device() if torch.cuda.is_available() else None
    rows, cols = pred['feature'].size()[-2:]
    iou_matrix, _iou, _, _data = iou_match(pred['yx_min'].data, pred['yx_max'].data, data)
    anchors = utils.ensure_device(anchors, device_id)
    positive = fit_positive(rows, cols, *(data[key] for key in 'yx_min, yx_max'.split(', ')), anchors)
    negative = ~positive & (_iou < threshold)
    _center_offset, _size_norm = fill_norm(*(_data[key] for key in 'yx_min, yx_max'.split(', ')), anchors)
    positive, negative, _iou, _center_offset, _size_norm, _cls = (
        torch.autograd.Variable(t) for t in (positive, negative, _iou, _center_offset, _size_norm, _data['cls']))
    _positive = torch.unsqueeze(positive, -1)
    loss = {}
    # iou
    loss['foreground'] = F.mse_loss(iou[positive], _iou[positive], reduction='sum')
    loss['background'] = torch.sum(square(iou[negative]))
    # bbox
    loss['center'] = F.mse_loss(pred['center_offset'][_positive], _center_offset[_positive], reduction='sum')
    loss['size'] = F.mse_loss(pred['size_norm'][_positive], _size_norm[_positive], reduction='sum')
    # cls
    if 'logits' in pred:
        logits = pred['logits']
        if len(_cls.size()) > 3:
            loss['cls'] = F.mse_loss(F.softmax(logits, -1)[_positive], _cls[_positive], reduction='sum')
        else:
            loss['cls'] = F.cross_entropy(logits[_positive].view(-1, logits.size(-1)), _cls[positive].view(-1))
    # normalize
    cnt = float(np.multiply.reduce(positive.size()))
    for key in loss:
        loss[key] /= cnt
    return loss, dict(iou=_iou, data=_data, positive=positive, negative=negative)
def forward(self, image_feat, question_embedding):
    att1 = self.att1.compute_raw_att(image_feat, question_embedding)
    att2 = self.att2.compute_raw_att(image_feat, question_embedding)
    raw_attention = att1 + att2
    # softmax across locations
    attention = F.softmax(raw_attention, dim=1).expand_as(image_feat)
    return attention
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_actions]
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr, next_best_distr, dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
def predict(self, x, attn_type="hard"):
    # predict with greedy decoding
    emb = self.embedding(x)
    h = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
    c = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
    enc_h, _ = self.encoder(emb, (h, c))
    y = [Variable(torch.zeros(x.size(0)).long())]
    self.attn = []
    for t in range(x.size(1)):
        emb_t = self.embedding(y[-1])
        dec_h, (h, c) = self.decoder(emb_t.unsqueeze(1), (h, c))
        scores = torch.bmm(enc_h, dec_h.transpose(1, 2)).squeeze(2)
        attn_dist = F.softmax(scores, dim=1)
        self.attn.append(attn_dist.data)
        if attn_type == "hard":
            _, argmax = attn_dist.max(1)
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1))
            context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
        pred = self.vocab_layer(torch.cat([dec_h.squeeze(1), context], 1))
        _, next_token = pred.max(1)
        y.append(next_token)
    self.attn = torch.stack(self.attn, 0).transpose(0, 1)
    return torch.stack(y, 0).transpose(0, 1)
def action_probs(self, x):
    x = self(x)
    # log_probs = F.log_softmax(x, dim=1)
    probs = F.softmax(x, dim=1)
    return probs
def sample(dataloader):
    for batch in dataloader:
        output_seq = Variable(batch['output_seq'])
        del batch['output_seq']
        for k in batch:
            batch[k] = Variable(batch[k])
        if DEVICE_NO != -1:
            output_seq = output_seq.cuda(DEVICE_NO)
            for k in batch:
                batch[k] = batch[k].cuda(DEVICE_NO)
        pred = uf.forward(**batch)
        pred = F.softmax(pred, dim=-1)
        prob, label = torch.max(pred, dim=-1)
        for i in range(len(list(batch.values())[0])):
            out_seq = []
            for j in range(int(batch['sent_len'][i])):
                word = idx2word[int(batch['word_seq'][i, int(j)])]
                pos = idx2pos[int(batch['pos_seq'][i, int(j)])]
                l_true = idx2label[int(output_seq[i, int(j)])]
                p = float(prob[i, int(j)])
                l = idx2label[int(label[i, int(j)])] if p > PROB_THRESH else 'O'
                out_seq.append([word, pos, l, l_true])
            out_seq = change_seq_format(out_seq)
            for item in out_seq:
                print('{}/{}/{}'.format(item[0], item[3], item[2]), end=' ')
            print('')
        input('input to continue:')
def forward_dot(self, hid, ctx, ctx_mask):
    r"""Computes Luong-style dot attention probabilities between
    decoder's hidden state and source annotations.

    Arguments:
        hid(Variable): A set of decoder hidden states of shape `T*B*H`
            where `T` == 1, `B` is batch dim and `H` is hidden state dim.
        ctx(Variable): A set of annotations of shape `S*B*C` where `S`
            is the source timestep dim, `B` is batch dim and `C`
            is annotation dim.
        ctx_mask(FloatTensor): A binary mask of shape `S*B` with zeroes
            in the padded timesteps.

    Returns:
        scores(Variable): A variable of shape `S*B` containing normalized
            attention scores for each position and sample.
        z_t(Variable): A variable of shape `B*H` containing the final
            attended context vector for this target decoding timestep.
    """
    # Apply transformations first to make last dims both C and then
    # shuffle dims to prepare for batch mat-mult
    ctx_ = self.ctx2ctx(ctx).permute(1, 2, 0)  # S*B*C -> S*B*C -> B*C*S
    hid_ = self.hid2ctx(hid).permute(1, 0, 2)  # T*B*H -> T*B*C -> B*T*C

    # 'dot' scores of B*T*S
    scores = F.softmax(torch.bmm(hid_, ctx_), dim=-1)

    # Transform back to hidden_dim for further decoders
    # B*T*S x B*S*C -> B*T*C -> B*T*H
    z_t = self.ctx2hid(torch.bmm(scores, ctx.transpose(0, 1)))

    return scores.transpose(0, 1), z_t.transpose(0, 1)
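# Standalone shape sketch (hypothetical tensors, projections omitted) of the
# Luong dot attention documented above: hid is T*B*H with T == 1, ctx is
# S*B*C with C == H, and a single bmm scores every source position at once.
import torch
import torch.nn.functional as F

S, B, H = 7, 3, 16
hid = torch.randn(1, B, H)
ctx = torch.randn(S, B, H)
scores = F.softmax(torch.bmm(hid.permute(1, 0, 2), ctx.permute(1, 2, 0)), dim=-1)  # B*1*S
z_t = torch.bmm(scores, ctx.transpose(0, 1))  # B*1*H attended context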
def forward(self, images, questions):
    N, T, _, _, _ = images.size()  # bs x 5 x 3 x 224 x 224

    img_feats = self.cnn(images.contiguous().view(
        -1, images.size(2), images.size(3), images.size(4)))
    img_feats = self.cnn_fc_layer(img_feats)
    img_feats_tr = self.img_tr(img_feats)

    ques_feats = self.q_rnn(questions)
    ques_feats_repl = ques_feats.view(N, 1, -1).repeat(1, T, 1)
    ques_feats_repl = ques_feats_repl.view(N * T, -1)
    ques_feats_tr = self.ques_tr(ques_feats_repl)

    ques_img_feats = torch.cat([ques_feats_tr, img_feats_tr], 1)
    att_feats = self.att(ques_img_feats)
    att_probs = F.softmax(att_feats.view(N, T), dim=1)
    att_probs2 = att_probs.view(N, T, 1).repeat(1, 1, 64)

    att_img_feats = torch.mul(att_probs2, img_feats.view(N, T, 64))
    att_img_feats = torch.sum(att_img_feats, dim=1)

    mul_feats = torch.mul(ques_feats, att_img_feats)
    scores = self.classifier(mul_feats)
    return scores, att_probs
def test(dataloader, out=sys.stdout):
    for batch in dataloader:
        if 'output_seq' in batch:
            del batch['output_seq']
        for k in batch:
            batch[k] = Variable(batch[k])
        if DEVICE_NO != -1:
            for k in batch:
                batch[k] = batch[k].cuda(DEVICE_NO)
        pred = uf.forward(**batch)
        pred = F.softmax(pred, dim=-1)
        prob, label = torch.max(pred, dim=-1)
        for i in range(len(list(batch.values())[0])):
            out_seq = []
            for j in range(int(batch['sent_len'][i])):
                word = idx2word[int(batch['word_seq'][i, int(j)])]
                pos = idx2pos[int(batch['pos_seq'][i, int(j)])]
                p = float(prob[i, int(j)])
                l = idx2label[int(label[i, int(j)])] if p > PROB_THRESH else 'O'
                out_seq.append([word, pos, l])
            out_seq = change_seq_format(out_seq)
            # out.write('{}/{}/{} '.format(word, pos, l))
            for item in out_seq:
                out.write('{}/{}/{} '.format(item[0], item[1], item[2]))
            out.write('\n')
def forward(self, sequence, graph):
    """
    Apply self-attention to the sequence, ignores the graph
    """
    sequence = sequence.squeeze(1)

    # get the dimensions
    n, d = sequence.size()

    # project the sequence into key, value, and query sequences
    keySeq = f.relu(self.keyProj(sequence))
    valueSeq = f.relu(self.valueProj(sequence))
    querySeq = f.relu(self.queryProj(sequence))

    # combine query with each key
    # a_ijh = softmax( (q_ih^T k_jh) / sqrt(d) )
    # the result is, row i is the importance of the sequence for key i
    importance = f.softmax(t.matmul(querySeq, keySeq.permute(1, 0)) / math.sqrt(d), 0).permute(1, 0)

    # apply the importance weights to the value sequence
    attention = t.matmul(valueSeq.permute(1, 0), importance).permute(1, 0)

    # sum the sequence for a complete representation
    final = t.sum(attention, 0)
    return attention.unsqueeze(1), final
def __call__(self):
    image_bgr = self.get_image()
    tensor = self.conv_tensor(image_bgr)
    pred = pybenchmark.profile('inference')(model._inference)(self.inference, torch.autograd.Variable(tensor, volatile=True))
    rows, cols = pred['feature'].size()[-2:]
    iou = pred['iou'].data.contiguous().view(-1)
    yx_min, yx_max = (pred[key].data.view(-1, 2) for key in 'yx_min, yx_max'.split(', '))
    logits = get_logits(pred)
    prob = F.softmax(logits, -1).data.view(-1, logits.size(-1))
    ret = postprocess(self.config, iou, yx_min, yx_max, prob)
    image_result = image_bgr.copy()
    if ret is not None:
        iou, yx_min, yx_max, cls, score = ret
        try:
            scale = self.scale
        except AttributeError:
            scale = utils.ensure_device(torch.from_numpy(
                np.array(image_result.shape[:2], np.float32) / np.array([rows, cols], np.float32)))
            self.scale = scale
        yx_min, yx_max = ((t * scale).cpu().numpy().astype(int) for t in (yx_min, yx_max))
        image_result = self.draw_bbox(image_result, yx_min, yx_max, cls)
    cv2.imshow('detection', image_result)
    if self.args.output:
        self.writer.write(image_result)
    if cv2.waitKey(0 if self.args.pause else 1) in self.keys:
        root = os.path.join(self.model_dir, 'snapshot')
        os.makedirs(root, exist_ok=True)
        path = os.path.join(root, time.strftime(self.args.format))
        cv2.imwrite(path, image_bgr)
        logging.warning('image dumped into ' + path)
def postprocess_detections(self, class_logits, sub_cls_logits, box_regression, proposals, image_shapes):
    # type: (Tensor, Tensor, Tensor, List[Tensor], List[Tuple[int, int]])
    device = class_logits.device
    num_classes = class_logits.shape[-1]
    num_sub_cls = 17

    boxes_per_image = [len(boxes_in_image) for boxes_in_image in proposals]
    pred_boxes = self.box_coder.decode(box_regression, proposals)

    pred_scores = F.softmax(class_logits, -1)
    pred_sub_scores = torch.sigmoid(sub_cls_logits)

    # split boxes and scores per image
    if len(boxes_per_image) == 1:
        # TODO : remove this when ONNX support dynamic split sizes
        # and just assign to pred_boxes instead of pred_boxes_list
        pred_boxes_list = [pred_boxes]
        pred_scores_list = [pred_scores]
        pred_sub_list = [pred_sub_scores]
    else:
        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)
        pred_sub_list = pred_sub_scores.split(boxes_per_image, 0)

    all_boxes = []
    all_scores = []
    all_labels = []
    all_subs = []
    for boxes, scores, sub_scores, image_shape in zip(pred_boxes_list, pred_scores_list, pred_sub_list, image_shapes):
        boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

        # create labels for each prediction
        labels = torch.arange(num_classes, device=device)
        labels = labels.view(1, -1).expand_as(scores)

        sub_labels = torch.arange(num_sub_cls, device=device)
        sub_labels = sub_labels.view(1, -1).expand_as(sub_scores) * (sub_scores > 0.5)
        sub_labels = torch.repeat_interleave(sub_labels, num_classes, dim=0)

        # remove predictions with the background label
        boxes = boxes[:, 1:]
        scores = scores[:, 1:]
        labels = labels[:, 1:]

        # batch everything, by making every class prediction be a separate instance
        boxes = boxes.reshape(-1, 4)
        scores = scores.reshape(-1)
        labels = labels.reshape(-1)
        sub_labels = sub_labels.reshape(-1, 17)

        # remove low scoring boxes
        inds = torch.nonzero(scores > self.score_thresh).squeeze(1)
        boxes, scores, labels, sub_labels = boxes[inds], scores[inds], labels[inds], sub_labels[inds]

        # remove empty boxes
        keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
        boxes, scores, labels, sub_labels = boxes[keep], scores[keep], labels[keep], sub_labels[keep]

        # non-maximum suppression, independently done per class
        keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
        # keep only topk scoring predictions
        keep = keep[:self.detections_per_img]
        boxes, scores, labels, sub_labels = boxes[keep], scores[keep], labels[keep], sub_labels[keep]

        all_boxes.append(boxes)
        all_scores.append(scores)
        all_labels.append(labels)
        all_subs.append(sub_labels)

    return all_boxes, all_scores, all_labels, all_subs
def _forward_loop(self,
                  state: Dict[str, torch.Tensor],
                  target_tokens: Dict[str, torch.LongTensor] = None) -> Dict[str, torch.Tensor]:
    """
    Make forward pass during training or do greedy search during prediction.

    Notes
    -----
    We really only use the predictions from the method to test that beam search
    with a beam size of 1 gives the same results.
    """
    # shape: (batch_size, max_input_sequence_length)
    source_mask = state["source_mask"]
    batch_size = source_mask.size()[0]

    if target_tokens:
        # shape: (batch_size, max_target_sequence_length)
        targets = target_tokens["tokens"]
        _, target_sequence_length = targets.size()
        # The last input from the target is either padding or the end symbol.
        # Either way, we don't have to process it.
        num_decoding_steps = target_sequence_length - 1
    else:
        num_decoding_steps = self._max_decoding_steps

    # Initialize target predictions with the start index.
    # shape: (batch_size,)
    last_predictions = source_mask.new_full((batch_size,), fill_value=self._start_index)

    step_logits: List[torch.Tensor] = []
    step_predictions: List[torch.Tensor] = []
    step_attn_weights: List[torch.Tensor] = []
    for timestep in range(num_decoding_steps):
        if self.training and torch.rand(1).item() < self._scheduled_sampling_ratio:
            # Use gold tokens at test time and at a rate of 1 - _scheduled_sampling_ratio
            # during training.
            # shape: (batch_size,)
            input_choices = last_predictions
        elif not target_tokens:
            # shape: (batch_size,)
            input_choices = last_predictions
        else:
            # shape: (batch_size,)
            input_choices = targets[:, timestep]

        # shape: (batch_size, num_classes)
        # shape: (batch_size, input_max_size)
        input_weights, output_projections, state = self._prepare_output_projections(input_choices, state)

        step_attn_weights.append(input_weights.unsqueeze(1))

        # list of tensors, shape: (batch_size, 1, num_classes)
        step_logits.append(output_projections.unsqueeze(1))

        # shape: (batch_size, num_classes)
        class_probabilities = F.softmax(output_projections, dim=-1)

        # shape (predicted_classes): (batch_size,)
        _, predicted_classes = torch.max(class_probabilities, 1)

        # shape (predicted_classes): (batch_size,)
        last_predictions = predicted_classes

        step_predictions.append(last_predictions.unsqueeze(1))

    # shape: (batch_size, num_decoding_steps)
    predictions = torch.cat(step_predictions, 1)
    # shape: (batch_size, num_decoding_steps, max_input_sequence_length)
    attention_input_weights = torch.cat(step_attn_weights, 1)

    output_dict = {"predictions": predictions,
                   "attention_input_weights": attention_input_weights}

    if target_tokens:
        # shape: (batch_size, num_decoding_steps, num_classes)
        logits = torch.cat(step_logits, 1)
        # shape: (batch_size, num_decoding_steps, max_input_sequence_length)
        attn_weights = torch.cat(step_attn_weights, 1)

        # Compute loss.
        target_mask = util.get_text_field_mask(target_tokens)
        loss = self._get_loss(logits, targets, target_mask)
        coverage_loss = self._get_coverage_loss(attn_weights, source_mask, target_mask)
        assert coverage_loss < 1
        self._coverage_loss(coverage_loss.detach().cpu().item())
        output_dict["loss"] = loss + self._coverage_lambda * coverage_loss

    return output_dict
def main(args):
    # Build data loader
    if not os.path.isdir(args.model_path):
        os.makedirs(args.model_path)
    data_loader, ds_class = get_loader(args.data_dir, args.seq_len, args.batch_size,
                                       shuffle=True, num_workers=args.num_workers, ds=args.ds)
    # Build eval data loader
    eval_data_loader, _ = get_loader(args.data_dir_test, args.seq_len, args.batch_size,
                                     shuffle=True, num_workers=args.num_workers, ds=args.ds,
                                     lbl2id=ds_class.lbl2id)
    model = SkeletonAction(args.input_size, args.hidden_size, args.num_class,
                           args.num_layers, args.use_bias, args.dropout)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        model.cuda()
        criterion = criterion.cuda()
    params = model.parameters()
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Load the trained model parameters.
    # Now, we try to find the latest encoder and decoder model.
    if os.path.isdir(args.model_path) and os.listdir(args.model_path):
        m_fn = max(glob.glob(os.path.join(args.model_path, 'model*')), key=os.path.getctime)
        if m_fn:
            model.load_state_dict(torch.load(m_fn))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        total_train = 0
        total_correct = 0
        total_train_2 = 0
        total_correct_2 = 0
        for i_step, (lbl, data, length) in enumerate(data_loader):
            # Set mini-batch dataset
            lbl = Variable(lbl)
            data = Variable(data)
            mask = torch.zeros(data.size(0), data.size(1))
            for i, m in zip(length, mask):
                m[0:i[0]] = 1
            mask = Variable(mask)
            if torch.cuda.is_available():
                lbl = lbl.cuda()
                data = data.cuda()
                mask = mask.cuda()
            model.zero_grad()
            opt = model(data)

            # compute accuracy.
            pred_lbl = opt.max(dim=-1)[1].data.cpu()
            gt_lbl = lbl.data.cpu()
            cnt = torch.LongTensor(lbl.size(0), args.num_class).zero_()
            for i in range(pred_lbl.size(0)):
                for j in range(length[i][0]):
                    cnt[i][pred_lbl[i, j]] += 1
            cnt_lbl = cnt.max(dim=-1)[1]
            total_train += data.size(0)
            total_correct += (cnt_lbl.squeeze() == gt_lbl.squeeze()).sum()

            prob = F.softmax(opt.view(opt.size(0) * opt.size(1), opt.size(2)), dim=-1)
            prob = prob.view(opt.size(0), opt.size(1), opt.size(2))
            prob = prob.sum(dim=1)
            pred_lbl = prob.max(dim=-1)[1].data.cpu()
            total_correct_2 += (pred_lbl.squeeze() == gt_lbl.squeeze()).sum()

            lbl = lbl.squeeze().unsqueeze(1)
            lbl = lbl.repeat(1, opt.size(1)).contiguous()
            lbl = lbl.view(lbl.size(0) * lbl.size(1))
            opt = opt.contiguous()
            opt = opt.view(opt.size(0) * opt.size(1), opt.size(2))
            log_p = F.log_softmax(opt, dim=-1)
            loss = - (mask.squeeze() * log_p[torch.LongTensor(range(opt.size(0))).cuda(), lbl.squeeze().data]).sum() / mask.sum()

            local_acc = (cnt_lbl.squeeze() == gt_lbl.squeeze()).sum() * 1.0 / data.size(0)
            local_acc2 = (pred_lbl.squeeze() == gt_lbl.squeeze()).sum() * 1.0 / data.size(0)
            if i_step % args.log_step == 0:
                logging.info('Epoch [%d/%d], [%d/%d], Loss: %.4f, accuracy: %5.4f, accuracy2: %5.4f',
                             epoch, args.num_epochs, i_step, len(data_loader), loss.data[0], local_acc, local_acc2)
            # loss = criterion(opt, lbl)
            loss.backward()
            optimizer.step()

            # Eval the trained model
            if i_step % args.eval_step == 0:
                model.eval()
                total_num = 0
                correct_num = 0
                correct_num2 = 0
                for k_step, (lbl, data, length) in enumerate(eval_data_loader):
                    lbl = Variable(lbl)
                    data = Variable(data)
                    mask = torch.zeros(data.size(0), data.size(1))
                    for i, m in zip(length, mask):
                        m[0:i[0]] = 1
                    if torch.cuda.is_available():
                        lbl = lbl.cuda()
                        data = data.cuda()
                        mask = mask.cuda()
                    mask = Variable(mask)
                    model.zero_grad()
                    opt = model(data)
                    pred_lbl = opt.max(dim=-1)[1].data.cpu()
                    gt_lbl = lbl.data.cpu()
                    cnt = torch.LongTensor(lbl.size(0), args.num_class).zero_()
                    for i in range(pred_lbl.size(0)):
                        for j in range(length[i][0]):
                            cnt[i][pred_lbl[i, j]] += 1
                    cnt = cnt.max(dim=-1)[1]
                    total_num += data.size(0)
                    correct_num += (cnt.squeeze() == gt_lbl.squeeze()).sum()

                    prob = F.softmax(opt.view(opt.size(0) * opt.size(1), opt.size(2)), dim=-1)
                    prob = prob.view(opt.size(0), opt.size(1), opt.size(2))
                    prob = prob.sum(dim=1)
                    pred_lbl = prob.max(dim=-1)[1].data.cpu()
                    correct_num2 += (pred_lbl.squeeze() == gt_lbl.squeeze()).sum()

                    lbl = lbl.squeeze().unsqueeze(1)
                    lbl = lbl.repeat(1, opt.size(1)).contiguous()
                    lbl = lbl.view(lbl.size(0) * lbl.size(1))
                    opt = opt.contiguous()
                    opt = opt.view(opt.size(0) * opt.size(1), opt.size(2))
                    # loss = criterion(opt, lbl)
                    log_p = F.log_softmax(opt, dim=-1)
                    loss = - (mask.squeeze() * log_p[torch.LongTensor(range(opt.size(0))).cuda(), lbl.squeeze().data]).sum() / mask.sum()
                accuracy = correct_num * 1.0 / total_num
                accuracy2 = correct_num2 * 1.0 / total_num
                logging.info('Validating [%d], Loss: %.4f, accuracy: %.4f, accuracy2 = %.4f',
                             epoch, loss.data[0], accuracy, accuracy2)
                model.train()

        if epoch % 10 == 0:
            logging.info('Epoch [%d/%d], Loss: %.4f, accuracy: %5.4f',
                         epoch, args.num_epochs, loss.data[0], accuracy)
            # Save the models
            torch.save(model.state_dict(),
                       os.path.join(args.model_path, 'model-%d.pkl' % (epoch + 1)))
def forward(self, up, down):
    refimg_fea = self.feature_extraction(up)       # reference image feature
    targetimg_fea = self.feature_extraction(down)  # target image feature

    # matching
    cost = Variable(
        torch.FloatTensor(refimg_fea.shape[0], refimg_fea.shape[1] * 2,
                          self.maxdisp // 4 * 3,
                          refimg_fea.shape[2], refimg_fea.shape[3]).zero_()).cuda()
    for i in range(self.maxdisp // 4 * 3):
        if i > 0:
            cost[:, :refimg_fea.size()[1], i, :, :] = refimg_fea[:, :, :, :]
            cost[:, refimg_fea.size()[1]:, i, :, :] = shift_down[:, :, :, :]
            shift_down = self.forF(shift_down)
        else:
            cost[:, :refimg_fea.size()[1], i, :, :] = refimg_fea
            cost[:, refimg_fea.size()[1]:, i, :, :] = targetimg_fea
            shift_down = self.forF(targetimg_fea)
    cost = cost.contiguous()

    cost0 = self.dres0(cost)
    cost0 = self.dres1(cost0) + cost0
    out1, pre1, post1 = self.dres2(cost0, None, None)
    out1 = out1 + cost0
    out2, pre2, post2 = self.dres3(out1, pre1, post1)
    out2 = out2 + cost0
    out3, pre3, post3 = self.dres4(out2, pre1, post2)
    out3 = out3 + cost0

    cost1 = self.classif1(out1)
    cost2 = self.classif2(out2) + cost1
    cost3 = self.classif3(out3) + cost2

    cost1 = F.upsample(cost1, [self.maxdisp * 3, up.size()[2], up.size()[3]],
                       mode='trilinear')  # when within units, the maxdisp needs to be modified
    cost2 = F.upsample(cost2, [self.maxdisp * 3, up.size()[2], up.size()[3]],
                       mode='trilinear')

    cost1 = torch.squeeze(cost1, 1)
    pred1 = F.softmax(cost1, dim=1)
    pred1 = disparityregression_sub3(self.maxdisp)(pred1)

    cost2 = torch.squeeze(cost2, 1)
    pred2 = F.softmax(cost2, dim=1)
    pred2 = disparityregression_sub3(self.maxdisp)(pred2)

    cost3 = F.upsample(cost3, [self.maxdisp * 3, up.size()[2], up.size()[3]],
                       mode='trilinear')
    cost3 = torch.squeeze(cost3, 1)
    pred3 = F.softmax(cost3, dim=1)
    pred3 = disparityregression_sub3(self.maxdisp)(pred3)

    return pred1, pred2, pred3
def sample_sequence_beam(model, length, args, start_token=None, batch_size=None, context=None,
                         temperature=1, top_k=0, device='cuda', sample=True, beam_size=5,
                         tokenizer=None, max_len=-1, min_len=-1):
    '''
    Use beam search to sample a sequence conditioned on the given context
    '''
    if start_token is None:
        assert context is not None, 'Specify exactly one of start_token and context!'
        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
    else:
        assert context is None, 'Specify exactly one of start_token and context!'
        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
    past = None

    # if specified, limit max generation length to input sentence length
    if args.max_len_inp:
        length = min([length, context.numel() + 1])  # +1 for endchar

    # full_beam = []
    candidates = [{'prev': torch.tensor(context), 'output': torch.tensor(context),
                   'past': None, 'ended': False, 'score': 0}]
    done_list = []
    with torch.no_grad():
        for i in trange(length):
            # the beam for the ith place
            candidates_sorted = sorted(candidates, key=lambda v: v['score'])
            k_best = candidates_sorted[:beam_size]  # get the best ones
            candidates = []
            for cand in k_best:
                past = None  # cand['past']
                prev = torch.tensor(cand['output'])  # cand['prev']
                output_0 = torch.tensor(cand['output'])
                logits, past = model(prev, past=past)
                logits = logits[:, -1, :] / temperature
                logits = top_k_logits(logits, k=top_k)
                log_probs = F.softmax(logits, dim=-1)
                vals, prev = torch.topk(log_probs, k=beam_size, dim=-1)
                vals = vals.view(beam_size)
                for j in range(prev.numel()):  # for each candidate expansion
                    output = torch.cat((torch.tensor(output_0), torch.tensor(prev[:, j].view(1, 1))), dim=1)
                    str_out = tokenizer.decode(output[0, :].tolist())
                    score = cand['score'] - vals[j].item()
                    done = args.end_tok in str_out  # '<|endoftext|>' in str_out
                    cand_out = {'prev': prev, 'output': output.data, 'past': past,
                                'score': score, 'str_out': str_out}
                    # if contains end token or reached end length, dump into done
                    if done or i == length - 1:
                        # put in the done pile
                        done_list += [cand_out]
                    else:
                        # put in the beam
                        candidates += [cand_out]

    if max_len != -1:  # if we have specified a max length
        tmp_done_list = []
        for d in done_list:
            # remove '<|endoftext|>' if applicable
            str_out = tokenizer.decode(d['output'][0, :].tolist())
            trimmed_ind = str_out.find(args.end_tok)  # ('<|endoftext|>')
            if trimmed_ind == -1:
                trimmed_ind = len(str_out)
            str_out = str_out[:trimmed_ind]
            tok_len = len(tokenizer.encode(str_out))
            if tok_len <= max_len + 1:
                print('encoded {}'.format(tokenizer.encode(str_out)))
                tmp_done_list += [d]
        done_list = tmp_done_list

    if min_len != -1:
        # trim all of the out strings so min_len is meaningful
        done_list_tmp = []
        for d in done_list:
            str_out = tokenizer.decode(d['output'][0, :].tolist())
            str_out = trim_text(str_out, args.end_tok, 10000)  # remove end_token if you need to
            if len(tokenizer.encode(str_out)) >= context.numel() + min_len:
                done_list_tmp += [d]
        done_list = done_list_tmp

    output = max(done_list, key=lambda v: v['score'])['output']
    return output
def generate(self, src_enc, src_len, tgt_lang_id, max_len=200, sample_temperature=None):
    """
    Decode a sentence given initial start.
    `x`:
        - LongTensor(bs, slen)
            <EOS> W1 W2 W3 <EOS> <PAD>
            <EOS> W1 W2 W3   W4  <EOS>
    `lengths`:
        - LongTensor(bs) [5, 6]
    `positions`:
        - False, for regular "arange" positions (LM)
        - True, to reset positions from the new generation (MT)
    `langs`:
        - must be None if the model only supports one language
        - lang_id if only one language is involved (LM)
        - (lang_id1, lang_id2) if two languages are involved (MT)
    """

    # input batch
    bs = len(src_len)
    assert src_enc.size(0) == bs

    # generated sentences
    generated = src_len.new(max_len, bs)  # upcoming output
    generated.fill_(self.pad_index)       # fill upcoming output with <PAD>
    generated[0].fill_(self.eos_index)    # we use <EOS> for <BOS> everywhere

    # positions
    positions = src_len.new(max_len).long()
    positions = torch.arange(max_len, out=positions).unsqueeze(1).expand(max_len, bs)

    # language IDs
    langs = src_len.new(max_len).long().fill_(tgt_lang_id)
    langs = langs.unsqueeze(1).expand(max_len, bs)

    # current position / max lengths / length of generated sentences / unfinished sentences
    cur_len = 1
    gen_len = src_len.clone().fill_(1)
    unfinished_sents = src_len.clone().fill_(1)

    # cache compute states
    cache = {'slen': 0}

    while cur_len < max_len:

        # compute word scores
        tensor = self.forward(
            'fwd',
            x=generated[:cur_len],
            lengths=gen_len,
            positions=positions[:cur_len],
            langs=langs[:cur_len],
            causal=True,
            src_enc=src_enc,
            src_len=src_len,
            cache=cache
        )
        assert tensor.size() == (1, bs, self.dim), (cur_len, max_len, src_enc.size(), tensor.size(), (1, bs, self.dim))
        tensor = tensor.data[-1, :, :].type_as(src_enc)  # (bs, dim)
        scores = self.pred_layer.get_scores(tensor)      # (bs, n_words)

        # select next words: sample or greedy
        if sample_temperature is None:
            if self.mask_gen_lang is True:
                next_words = torch.topk(scores, self.mask_topk)[1].squeeze(1)
            else:
                next_words = torch.topk(scores, 1)[1].squeeze(1)
        else:
            if self.mask_gen_lang is True:
                next_words = torch.multinomial(F.softmax(scores / sample_temperature, dim=1), self.mask_topk).squeeze(1)
            else:
                next_words = torch.multinomial(F.softmax(scores / sample_temperature, dim=1), 1).squeeze(1)

        if self.mask_gen_lang is True:
            tmp_next_words = torch.zeros(bs, dtype=torch.long)
            for j, next_word in enumerate(next_words.cpu()):
                has_tgt_id = False
                for i, wi in enumerate(next_word):
                    if language_detect(self.dico.id2word[wi.item()], self.id2lang[tgt_lang_id]):
                        has_tgt_id = True
                        tmp_next_words[j] = wi
                        break
                if has_tgt_id is False:
                    tmp_next_words[j] = next_words[j, 0]
            next_words = tmp_next_words.cuda()

        assert next_words.size() == (bs,)

        # update generations / lengths / finished sentences / current length
        generated[cur_len] = next_words * unfinished_sents + self.pad_index * (1 - unfinished_sents)
        gen_len.add_(unfinished_sents)
        unfinished_sents.mul_(next_words.ne(self.eos_index).long())
        cur_len = cur_len + 1

        # stop when there is a </s> in each sentence, or if we exceed the maximum length
        if unfinished_sents.max() == 0:
            break

    # add <EOS> to unfinished sentences
    if cur_len == max_len:
        generated[-1].masked_fill_(unfinished_sents.byte(), self.eos_index)

    # sanity check
    assert (generated == self.eos_index).sum() == 2 * bs

    return generated[:cur_len], gen_len
def forward(self, x):
    x = self.features(x)
    x = x.view(x.size(0), -1)
    x = self.classifier(x)
    x = F.softmax(x, dim=1)
    return x
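# Hedged note on the forward above: because it already applies F.softmax,
# pairing it with nn.CrossEntropyLoss (which applies log-softmax internally)
# would double-normalize. The usual split, sketched with stand-in tensors:
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10, requires_grad=True)  # stand-in for the pre-softmax classifier output
loss = nn.CrossEntropyLoss()(logits, torch.randint(10, (4,)))  # train on raw logits
probs = F.softmax(logits, dim=1)  # apply softmax only when predicting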
('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),),
('layer_norm', (S, S, S, S), ([5],), '',
 (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])),
('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)),), 'with_only_weight',
 (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])),
('layer_norm', (S, S, S, S), ([5], None, non_differentiable(torch.rand(S)),), 'with_only_bias',
 (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])),
('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)), non_differentiable(torch.rand(S))),
 'with_weight_and_bias',
 (False, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])),
('group_norm', (S, S, S), (1, torch.rand(5),),),
('local_response_norm', (S, S, S), (2, ),),
('nll_loss', F.log_softmax(torch.randn(3, 5), dim=0), (torch.tensor([1, 0, 4]),), '',),
('poisson_nll_loss', torch.rand(S, 2), (torch.rand(S, 2),),),
('poisson_nll_loss', torch.rand(S, 2), (torch.rand(S, 2), True, True), 'full'),
('kl_div', F.log_softmax(torch.randn(S, 10), 1), (F.softmax(torch.randn(S, 10), 1),),),
('cross_entropy', (3, S), (torch.randint(S, (3,), dtype=torch.int64),),),
('binary_cross_entropy_with_logits', (3,), (torch.empty(3).random_(2), ),),
('smooth_l1_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('huber_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('l1_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('mse_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('smooth_l1_loss', (3, S), ((torch.rand(3, S)),), 'with_grad'),
('huber_loss', (3, S), ((torch.rand(3, S)),), 'with_grad'),
('l1_loss', (3, S), ((torch.rand(3, S)),), 'with_grad'),
('mse_loss', (3, S), ((torch.rand(3, S)),), 'with_grad'),
('margin_ranking_loss', (S,), ((S,), (S,)),),
('hinge_embedding_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('soft_margin_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('multilabel_soft_margin_loss', (3, S), (non_differentiable(torch.rand(3, S)),),),
('cosine_embedding_loss', (S, S), ((S, S), non_differentiable(torch.rand(S,))),),
def siamese_track(state, im):
    refine_enable = True
    mask_enable = True
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    debug = True
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    targets = state["targets"]
    zf_lists = []
    BLUE = [255, 255, 255]

    for i, target in enumerate(targets):
        wc_x = target["target_sz"][1] + p.context_amount * sum(target["target_sz"])
        hc_x = target["target_sz"][0] + p.context_amount * sum(target["target_sz"])
        target["s_z"] = np.sqrt(wc_x * hc_x)
        target["scale_x"] = p.exemplar_size / target["s_z"]
        d_search = (p.instance_size - p.exemplar_size) / 2
        pad = d_search / target["scale_x"]
        target["s_z"] = target["s_z"] + 2 * pad
        target["crop_box"] = [target["target_pos"][0] - round(target["s_z"]) / 2,
                              target["target_pos"][1] - round(target["s_z"]) / 2,
                              round(target["s_z"]),
                              round(target["s_z"])]
        zf_lists.append(target["zf"])
        crop_box = target["crop_box"]

    # extract scaled crops for search region x at previous target position
    targets = get_subwindow_tracking(im, p.instance_size, avg_chans, targets=targets)
    # x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x), avg_chans).unsqueeze(0))

    tracking_data_list = []
    for target, zf in zip(targets, zf_lists):
        target["x_crop"] = Variable(target["im_to_torch"].unsqueeze(0)).to(device)
        tracking_data_list.append({"x_crop": target["x_crop"], "zf": zf})

    if mask_enable:
        results = net.track_mask(search=targets[0]["x_crop"], lists=tracking_data_list)
    # else:
    #     score, delta = net.track(x_crop.to(device))

    # decode RPN outputs: class scores via softmax, boxes via anchor offsets
    for result in results:
        delta = result["rpn_pred_loc"]
        score = result["rpn_pred_cls"]
        delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
        score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                          dim=1).data[:, 1].cpu().numpy()
        delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
        delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]
        result["rpn_pred_loc"] = delta
        result["rpn_pred_cls"] = score

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    count = 0
    for target, result in zip(targets, results):
        delta = result["rpn_pred_loc"]
        score = result["rpn_pred_cls"]
        crop_box = target["crop_box"]
        target_sz_in_crop = target["target_sz"] * target["scale_x"]
        s_c = change(sz(delta[2, :], delta[3, :]) / sz_wh(target_sz_in_crop))  # scale penalty
        r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

        penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
        pscore = penalty * score
        # cosine window (motion model)
        pscore = pscore * (1 - p.window_influence) + window * p.window_influence
        best_pscore_id = np.argmax(pscore)

        pred_in_crop = delta[:, best_pscore_id] / target["scale_x"]
        lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB
        res_x = pred_in_crop[0] + target["target_pos"][0]
        res_y = pred_in_crop[1] + target["target_pos"][1]
        res_w = target["target_sz"][0] * (1 - lr) + pred_in_crop[2] * lr
        res_h = target["target_sz"][1] * (1 - lr) + pred_in_crop[3] * lr
        target["target_pos"] = np.array([res_x, res_y])
        target["target_sz"] = np.array([res_w, res_h])

        if mask_enable:
            best_pscore_id_mask = np.unravel_index(best_pscore_id, (5, p.score_size, p.score_size))
            delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]
            if refine_enable:
                mask = net.track_refine((delta_y, delta_x), index=count).to(device).sigmoid().squeeze().view(
                    p.out_size, p.out_size).cpu().data.numpy()
            else:
                mask = mask[0, :, delta_y, delta_x].sigmoid().squeeze().view(
                    p.out_size, p.out_size).cpu().data.numpy()
            count += 1

            def crop_back(image, bbox, out_sz, padding=-1):
                a = (out_sz[0] - 1) / bbox[2]
                b = (out_sz[1] - 1) / bbox[3]
                c = -a * bbox[0]
                d = -b * bbox[1]
                mapping = np.array([[a, 0, c],
                                    [0, b, d]]).astype(float)  # np.float is deprecated
                crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                      flags=cv2.INTER_LINEAR,
                                      borderMode=cv2.BORDER_CONSTANT,
                                      borderValue=padding)
                return crop

            s = crop_box[2] / p.instance_size
            sub_box = [crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
                       crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
                       s * p.exemplar_size,
                       s * p.exemplar_size]
            s = p.out_size / sub_box[2]
            back_box = [-sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s, state['im_h'] * s]
            mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))
            target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
            if cv2.__version__[-5] == '4':
                contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            else:
                _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            cnt_area = [cv2.contourArea(cnt) for cnt in contours]
            if len(contours) != 0 and np.max(cnt_area) > 100:
                contour = contours[np.argmax(cnt_area)]  # use max area polygon
                polygon = contour.reshape(-1, 2)
                # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
                prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle
                # box_in_img = pbox
                rbox_in_img = prbox
            else:  # empty mask
                location = cxy_wh_2_rect(target["target_pos"], target["target_sz"])
                rbox_in_img = np.array([[location[0], location[1]],
                                        [location[0] + location[2], location[1]],
                                        [location[0] + location[2], location[1] + location[3]],
                                        [location[0], location[1] + location[3]]])

        target["target_pos"][0] = max(0, min(state['im_w'], target["target_pos"][0]))
        target["target_pos"][1] = max(0, min(state['im_h'], target["target_pos"][1]))
        target["target_sz"][0] = max(10, min(state['im_w'], target["target_sz"][0]))
        target["target_sz"][1] = max(10, min(state['im_h'], target["target_sz"][1]))
        # print("new targetPos {} and targetsize {} \n".format(target["target_pos"], target["target_sz"]))
        target["mask"] = mask_in_img if mask_enable else []
        target['ploygon'] = rbox_in_img if mask_enable else []  # key name kept as-is (sic)
        target["score"] = score[best_pscore_id]

    state["targets"] = targets
    return state
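# A minimal, self-contained sketch (not part of the tracker above) of the box
# decoding plus cosine-window smoothing it performs; the anchor layout
# [cx, cy, w, h] and the 0.42 window influence are illustrative assumptions.
import numpy as np

def decode_and_smooth(delta, score, anchor, window, window_influence=0.42):
    # delta: [4, N] regression offsets; anchor: [N, 4]; score, window: [N]
    xy = delta[:2] * anchor[:, 2:].T + anchor[:, :2].T  # shift centers by w/h-scaled offsets
    wh = np.exp(delta[2:]) * anchor[:, 2:].T            # exponentiated size offsets
    pscore = score * (1 - window_influence) + window * window_influence
    best = int(np.argmax(pscore))
    return np.concatenate([xy, wh])[:, best]            # best box as [cx, cy, w, h]

anchor = np.abs(np.random.randn(25, 4)) + 1.0
box = decode_and_smooth(np.random.randn(4, 25), np.random.rand(25),
                        anchor, np.random.rand(25))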
def test_model(model, hist, criterion, dataloaders, dataset_sizes, half=False):
    """
    Testing function. Prints the loss and accuracy after inference on the test set.
    """
    print("\n\n**TESTING**\n")
    sincetime = time.time()
    phase = "test"
    model.eval()  # set model to evaluate mode
    running_loss = 0.0
    running_corrects = 0
    list_y_pred = []
    list_y_true = []
    list_probs = []
    nb_batches = len(dataloaders[phase])
    pbar = tqdm.tqdm([i for i in range(nb_batches)])

    # Iterate over data.
    for batch_idx, (inputs, labels) in enumerate(dataloaders[phase]):
        pbar.update()
        pbar.set_description("Processing batch %s" % str(batch_idx + 1))
        inputs = inputs.to(DEVICE)
        labels = labels.to(DEVICE)

        # after quantization
        if half:
            inputs = inputs.half()

        # forward; history is never tracked during testing
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            probs = softmax(outputs, 1)
            loss = criterion(outputs, labels)

        # statistics (the int(...) casts assume a test batch size of 1)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)
        list_y_pred.append(int(preds.cpu()))
        list_y_true.append(int(labels.data.cpu()))
        list_probs.append(probs.cpu())
    pbar.close()

    test_loss = running_loss / dataset_sizes[phase]
    test_acc = running_corrects.double() / dataset_sizes[phase]
    test_acc = round(float(test_acc), 4)
    hist['test_acc'] = test_acc
    hist['y_pred'] = list_y_pred
    hist['probs'] = np.stack(list_probs).reshape(-1, 3)
    hist['y_true'] = list_y_true
    print('\nTest stats - Loss: {:.4f} Acc: {:.2f}%'.format(test_loss, test_acc * 100))
    print("Inference on Testset complete in {:.1f}s\n".format(time.time() - sincetime))
    return hist
def softmax(self, inp, h):
    raw_score = inp.bmm(h.unsqueeze(2))
    score = F.softmax(raw_score, dim=1)
    return score
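# A self-contained check of the bilinear-attention softmax above, with the
# computation reproduced inline (shapes are illustrative assumptions): score
# each encoder step against a hidden state, then take the weighted context.
import torch
import torch.nn.functional as F

inp = torch.randn(8, 20, 256)                        # [batch, seq_len, hidden]
h = torch.randn(8, 256)                              # [batch, hidden]
score = F.softmax(inp.bmm(h.unsqueeze(2)), dim=1)    # same computation as the method
assert torch.allclose(score.sum(dim=1), torch.ones(8, 1))
context = score.transpose(1, 2).bmm(inp).squeeze(1)  # attention-weighted context [batch, hidden]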
def soft_attn(self, direction):
    di0 = self.bn_di0(direction)
    di = F.relu(self.bn_di(self.fc_di(di0)))
    x_di = self.fc_attn(di)
    attn = F.softmax(x_di, 1)
    return attn
def output(task_name, immediate_output_dict):
    module_name = f"{task_name}_pred_head"
    return F.softmax(immediate_output_dict[module_name], dim=1)
def forward(self, x):
    output = self._forward(x)
    proba = F.softmax(output, dim=1)
    return proba
def forward(self, x):
    x = self.affine1(x)
    x = F.relu(x)
    x = F.relu(self.affine2(x))
    action_scores = self.affine3(x)
    return F.softmax(action_scores, dim=1)
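# One common way to consume action probabilities like those returned by the
# forward() above, sampling an action for REINFORCE. The stand-in policy and
# the 4-dim observation are illustrative assumptions, not the original model.
import torch
import torch.nn as nn
from torch.distributions import Categorical

policy = nn.Sequential(nn.Linear(4, 16), nn.ReLU(),
                       nn.Linear(16, 2), nn.Softmax(dim=1))
state = torch.randn(1, 4)       # assumed observation (e.g. CartPole)
probs = policy(state)           # [1, num_actions], rows sum to 1
m = Categorical(probs)
action = m.sample()
log_prob = m.log_prob(action)   # later weighted by the return and negated as the loss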
def get_pred(x):
    if resize:
        x = up(x)
    x = inception_model(x)
    # dim=1 made explicit: the implicit-dim form of F.softmax is deprecated,
    # and dim=1 is the class axis of the [N, 1000] Inception logits
    return F.softmax(x, dim=1).data.cpu().numpy()
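# Sketch of how per-image class probabilities like get_pred's output are
# typically folded into the Inception Score, IS = exp(E_x KL(p(y|x) || p(y))).
# This helper is an assumption for illustration, not part of the original code.
import numpy as np

def inception_score_from_probs(probs, eps=1e-16):
    # probs: [N, num_classes], each row a softmax output p(y|x)
    p_y = probs.mean(axis=0, keepdims=True)                     # marginal p(y)
    kl = (probs * (np.log(probs + eps) - np.log(p_y + eps))).sum(axis=1)
    return float(np.exp(kl.mean()))

probs = np.random.dirichlet(np.ones(10), size=100)              # fake predictions
print(inception_score_from_probs(probs))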
def train(self):
    progress = sly.Progress('Model training: ', self.epochs * self.train_iters)
    self.model.train()

    lr_decr = self.config['lr_decreasing']
    policy = LRPolicyWithPatience(
        optim_cls=Adam,
        init_lr=self.config['lr'],
        patience=lr_decr['patience'],
        lr_divisor=lr_decr['lr_divisor'],
        model=self.model
    )
    best_val_loss = float('inf')

    debug_saver = None
    debug_save_prob = float(os.getenv('DEBUG_PATCHES_PROB', 0.0))
    if debug_save_prob > 0:
        target_multi = int(255.0 / len(self.out_classes))
        debug_saver = DebugSaver(odir=os.path.join(sly.TaskPaths.DEBUG_DIR, 'debug_patches'),
                                 prob=debug_save_prob,
                                 target_multi=target_multi)

    for epoch in range(self.epochs):
        sly.logger.info("Before new epoch", extra={'epoch': self.epoch_flt})

        for train_it, (inputs_cpu, targets_cpu) in enumerate(self.data_loaders['train']):
            inputs, targets = cuda_variable(inputs_cpu), cuda_variable(targets_cpu)
            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)

            if debug_saver is not None:
                out_cls = functional.softmax(outputs, dim=1)
                debug_saver.process(inputs_cpu, targets_cpu, out_cls.data.cpu())

            policy.optimizer.zero_grad()
            loss.backward()
            policy.optimizer.step()

            metric_values_train = {'loss': loss.data[0]}
            for name, metric in self.metrics.items():
                metric_values_train[name] = metric(outputs, targets)

            progress.iter_done_report()

            self.epoch_flt = epoch_float(epoch, train_it + 1, self.train_iters)
            sly.report_metrics_training(self.epoch_flt, metric_values_train)

            if self.eval_planner.need_validation(self.epoch_flt):
                metrics_values_val = self._validation()
                self.eval_planner.validation_performed()

                val_loss = metrics_values_val['loss']
                model_is_best = val_loss < best_val_loss
                if model_is_best:
                    best_val_loss = val_loss
                    sly.logger.info("It's been determined that current model is the best one for a while.")

                self._save_model_snapshot(model_is_best, opt_data={
                    'epoch': self.epoch_flt,
                    'val_metrics': metrics_values_val,
                })

                policy.reset_if_needed(val_loss, self.model)

        sly.logger.info("Epoch was finished", extra={'epoch': self.epoch_flt})
def _softmax(x):
    return F.softmax(x, dim=-1)
def forward(self, inp):
    scores = F.softmax(self.scorer(inp), dim=1)
    cont = scores.transpose(1, 2).bmm(inp).squeeze(1)
    return cont
def predict_class_probs(self, x):
    probs = F.softmax(self.forward(x), dim=1)
    return probs
def decode(self, z):
    d1 = F.softmax(self.fcd1(z), dim=1)
    return d1
def forward(self, records_u, is_train):
    predicted_scores = Variable(torch.zeros(
        records_u.get_predicting_records_cnt(mod=0), self.nb_cnt + 1)) if is_train else []
    records_al = records_u.get_records(mod=0) if is_train else records_u.get_records(mod=2)
    vids_visited = set([record.vid for record in records_u.get_records(mod=0)])
    emb_u = self.embedder_u(Variable(torch.LongTensor([records_u.uid])).view(1, -1)).view(1, -1)
    emb_u = F.relu(emb_u)
    emb_t_al = Variable(torch.zeros(len(records_al), self.emb_dim_t))
    hidden_long_al = Variable(torch.zeros(len(records_al), self.hidden_dim))
    hidden_short_al = Variable(torch.zeros(len(records_al), self.hidden_dim))
    feature_al = Variable(torch.zeros(len(records_al), self.att_dim))
    hidden_long = self.init_hidden()
    hidden_short = self.init_hidden()
    for idx, record in enumerate(records_u.get_records(mod=0)):  # can only use train data
        if record.is_first:
            hidden_short = self.init_hidden()
        emb_t_al[idx] = F.relu(self.embedder_t(
            Variable(torch.LongTensor([record.tid])).view(1, -1)).view(1, -1))  # current time embedding
        # feature: current time + previous hiddens
        feature_al[idx] = torch.cat(
            (F.relu(hidden_long), F.relu(hidden_short), emb_t_al[idx].view(1, -1)), 1)
        emb_v = self.embedder_v(Variable(torch.LongTensor([record.vid])).view(1, -1)).view(1, -1)
        hidden_long = self.rnn_long(emb_v, hidden_long)
        hidden_short = self.rnn_short(emb_v, hidden_short)
        hidden_long_al[idx] = F.relu(hidden_long)
        hidden_short_al[idx] = F.relu(hidden_short)
    id = 0
    id_vids_true = []
    id_vids = []
    for idx, record in enumerate(records_al):
        if idx >= records_u.test_idx:  # append the states of testing records
            if record.is_first:
                hidden_short = self.init_hidden()
            emb_t_al[idx] = F.relu(self.embedder_t(
                Variable(torch.LongTensor([record.tid])).view(1, -1)).view(1, -1))
            feature_al[idx] = torch.cat(
                (F.relu(hidden_long), F.relu(hidden_short), emb_t_al[idx].view(1, -1)), 1)
            emb_v = self.embedder_v(Variable(torch.LongTensor([record.vid])).view(1, -1)).view(1, -1)
            hidden_long = self.rnn_long(emb_v, hidden_long)
            hidden_short = self.rnn_short(emb_v, hidden_short)
            hidden_long_al[idx] = F.relu(hidden_long)
            hidden_short_al[idx] = F.relu(hidden_short)
        if record.is_last or (not is_train and idx < records_u.test_idx):
            continue
        vids_visited.add(record.vid)
        vid_candidates = self.get_vids_candidate(record.rid, vids_visited, record.vid_next, is_train)
        id_vids_true.append(record.vid_next)
        id_vids.append(vid_candidates)
        scores_u = self.decoder_u(emb_u, Variable(torch.LongTensor(vid_candidates)).view(1, -1))
        scores_t = self.decoder_t(emb_t_al[idx + 1].view(1, -1),
                                  Variable(torch.LongTensor(vid_candidates)).view(1, -1))
        scores_hl = self.decoder_hl(hidden_long_al[idx].view(1, -1),
                                    Variable(torch.LongTensor(vid_candidates)).view(1, -1))
        scores_hs = self.decoder_hs(hidden_short_al[idx].view(1, -1),
                                    Variable(torch.LongTensor(vid_candidates)).view(1, -1))
        scores_d_all = self.get_scores_d_all(records_u, idx, vid_candidates, feature_al, is_train)
        if self.mod == 0:
            scores_merge = torch.cat((scores_u, scores_t, scores_hl, scores_hs, scores_d_all), 0).t()
            if is_train:
                predicted_scores[id] = F.sigmoid(
                    F.linear(scores_merge, F.relu(self.merger_weight), bias=None).t())
            else:
                predicted_scores.append(F.softmax(
                    F.linear(scores_merge, F.relu(self.merger_weight), bias=None).t()))
        elif self.mod == 1:
            scores_d_pre = self.get_scores_d_pre(records_u, idx, vid_candidates, feature_al, is_train)
            scores_merge = torch.cat((scores_u, scores_t, scores_hl, scores_hs,
                                      scores_d_all, scores_d_pre), 0).t()
            if is_train:
                predicted_scores[id] = F.sigmoid(
                    F.linear(scores_merge, F.relu(self.merger_weight), bias=None).t())
            else:
                predicted_scores.append(F.softmax(
                    F.linear(scores_merge, F.relu(self.merger_weight), bias=None).t()))
        elif self.mod == 2:
            scores_d_pre = self.get_scores_d_pre(records_u, idx, vid_candidates, feature_al, is_train)
            scores_merge = torch.cat((scores_u, scores_t, scores_hl, scores_hs,
                                      scores_d_all, scores_d_pre), 0).t()
            gap_time = (records_al[idx + 1].dt - record.dt).total_seconds() / 60 / 60
            gap_time_int = int(gap_time)
            weight_lower = gap_time_int + 1 - gap_time
            weight_upper = gap_time - gap_time_int
            merger_weight_linear = F.relu(self.merger_weight_al[gap_time_int]) * weight_lower \
                + F.relu(self.merger_weight_al[gap_time_int + 1]) * weight_upper
            scores_pre_final = F.linear(scores_merge, merger_weight_linear, bias=None).t()
            if is_train:
                predicted_scores[id] = F.sigmoid(scores_pre_final)
            else:
                predicted_scores.append(F.softmax(scores_pre_final))
        elif self.mod == 3:
            scores_d_pre = self.get_scores_d_pre(records_u, idx, vid_candidates, feature_al, is_train)
            scores_merge = torch.cat((scores_u, scores_t, scores_hl, scores_hs,
                                      scores_d_all, scores_d_pre), 0).t()
            gap_time = (records_al[idx + 1].dt - record.dt).total_seconds() / 60 / 60
            gap_time_int = int(gap_time)
            if is_train:
                predicted_scores[id] = F.sigmoid(
                    F.linear(scores_merge, F.relu(self.merger_weight_al[gap_time_int]), bias=None).t())
            else:
                predicted_scores.append(F.softmax(
                    F.linear(scores_merge, F.relu(self.merger_weight_al[gap_time_int]), bias=None).t()))
        id += 1
    return predicted_scores, id_vids, id_vids_true
def inference(self, label_score, k=1):
    label_prob = F.softmax(label_score, dim=-1)
    label_prob, label_pred = label_prob.data.topk(k)
    return label_prob, label_pred
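# Hypothetical call pattern for the inference() above: since topk is applied
# along the last dimension, a [batch, num_labels] score matrix yields
# [batch, k] probabilities and indices. Reproduced inline to stay self-contained.
import torch
import torch.nn.functional as F

label_score = torch.randn(3, 5)              # 3 examples, 5 labels
label_prob = F.softmax(label_score, dim=-1)
top_prob, top_pred = label_prob.topk(2)      # both [3, 2], best label first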
def evaluate_metrics(p, img_dict, model='resnet', times=1, metrics=[],
                     outpath_root='.', labs_vs_gt=None):
    GT = labs_vs_gt[0]
    labs = labs_vs_gt[1]
    num_imgs = len(img_dict)
    if model == 'resnet':
        arch = models.resnet18(pretrained=True).eval()
    elif model == 'vgg':
        arch = models.vgg16(pretrained=True).eval()
    elif model == 'alexnet':
        arch = models.alexnet(pretrained=True).eval()
    if torch.cuda.is_available():
        arch = arch.cuda()
    start = time.time()
    now = start
    average_drop, increase_in_confidence = 0.0, 0.0
    deletion, insertion = [], []
    if metrics:  # was `if metrics is not []`, which is always True
        for _ in range(times):
            for i, (k, img) in enumerate(img_dict.items()):
                outpath = outpath_root + f'{k}_{img}/'
                inp_0 = load_image(p + '/' + img)
                os.mkdir(outpath)
                inp_0.save(f'{outpath}{img}')
                inp = apply_transforms(inp_0)
                if torch.cuda.is_available():
                    inp = inp.cuda()
                now = time.time()
                out, scorecam_map = expmap.get_explanation_map(arch=model, img=p + '/' + img)
                # F is torchvision.transforms.functional here; FF is torch.nn.functional
                F.to_pil_image(scorecam_map.squeeze(0)).save(f'{outpath}/exp_map.png')
                if torch.cuda.is_available():
                    scorecam_map = scorecam_map.cuda()
                out_sal = FF.softmax(arch(inp * scorecam_map), dim=1)
                Y_i_c = out.max(1)[0].item()              # confidence on the full image
                class_idx = out.max(1)[-1].item()
                class_name = labs[class_idx]
                gt_name = GT[str(img[-13:-5])][0].split()[1]
                O_i_c = out_sal[:, class_idx][0].item()   # confidence on the masked image
                if 'average_drop' in metrics and 'increase_in_confidence' in metrics:
                    average_drop, increase_in_confidence = ADIC.average_drop_and_increase_of_confidence(
                        average_drop, increase_in_confidence, Y_i_c, O_i_c)
                if 'deletion' in metrics and 'insertion' in metrics:
                    precision = 100
                    deletion, insertion = DAI.deletion_and_insertion(
                        deletion, insertion, inp, scorecam_map, arch, step=1 / precision)
                    deletion_score = round(
                        SKM.auc(torch.arange(0, 1, 1 / precision).numpy(),
                                torch.tensor(deletion).numpy()), 3)
                    insertion_score = round(
                        SKM.auc(torch.arange(0, 1, 1 / precision).numpy(),
                                torch.tensor(insertion).numpy()), 3)
                    plot(torch.arange(0, 1, 1 / precision), [deletion, insertion],
                         label=[f'deletion={deletion_score}', f'insertion={insertion_score}'],
                         path=f'{outpath}plot_{k}.png',
                         title=f'label={class_name}, GT={gt_name}')
                    print(f'The final deletion is: {deletion_score}')
                    print(f'The final insertion is: {insertion_score}')
                    deletion, insertion = [], []
                print(f'After one img: {int(time.time() - now)}s')
                now = time.time()
    print(f'In {num_imgs} images')
    if 'average_drop' in metrics and 'increase_in_confidence' in metrics:
        average_drop *= 100 / num_imgs
        increase_in_confidence *= 100 / num_imgs
        print(f'The final AVG drop is: {round(average_drop, 2)}%')
        print(f'The final Increase in Confidence is: {round(increase_in_confidence, 2)}%')
    print(f'Execution time: {int(time.time() - start)}s')
def forward(self, v, q, v_mask, q_mask):
    """
    v: visual feature   [batch, num_obj, feat_size]
    q: question         [batch, max_len, feat_size]
    v_mask              [batch, num_obj]
    q_mask              [batch, max_len]
    """
    batch_size, num_obj = v_mask.shape
    _, max_len = q_mask.shape
    # transform features
    v_trans = self.v_lin(v)
    q_trans = self.q_lin(q)
    # mask all padding object/word features
    if APPLY_MASK:
        v_trans = v_trans * v_mask.unsqueeze(2)
        q_trans = q_trans * q_mask.unsqueeze(2)
    # split into key / query / value
    v_key, v_qry, v_val = torch.split(v_trans, v_trans.size(2) // 3, dim=2)
    q_key, q_qry, q_val = torch.split(q_trans, q_trans.size(2) // 3, dim=2)
    # split each into per-head slices
    v_key_set = torch.split(v_key, v_key.size(2) // self.num_head, dim=2)
    v_qry_set = torch.split(v_qry, v_qry.size(2) // self.num_head, dim=2)
    v_val_set = torch.split(v_val, v_val.size(2) // self.num_head, dim=2)
    q_key_set = torch.split(q_key, q_key.size(2) // self.num_head, dim=2)
    q_qry_set = torch.split(q_qry, q_qry.size(2) // self.num_head, dim=2)
    q_val_set = torch.split(q_val, q_val.size(2) // self.num_head, dim=2)
    # multi-head
    for i in range(self.num_head):
        v_key_slice, v_qry_slice, v_val_slice = v_key_set[i], v_qry_set[i], v_val_set[i]  # [batch, num_obj, feat_size]
        q_key_slice, q_qry_slice, q_val_slice = q_key_set[i], q_qry_set[i], q_val_set[i]  # [batch, max_len, feat_size]
        # inner product, normalized by the square root of the hidden dimension;
        # padded object/word positions are set to negative infinity
        q2v = (v_qry_slice @ q_key_slice.transpose(1, 2)) / (
            (self.output_size // self.num_head) ** 0.5)  # [batch, num_obj, max_len]
        v2q = (q_qry_slice @ v_key_slice.transpose(1, 2)) / (
            (self.output_size // self.num_head) ** 0.5)  # [batch, max_len, num_obj]
        if APPLY_MASK:
            q2v.masked_fill_(q_mask.unsqueeze(1).expand([batch_size, num_obj, max_len]) == 0,
                             -float('inf'))
            v2q.masked_fill_(v_mask.unsqueeze(1).expand([batch_size, max_len, num_obj]) == 0,
                             -float('inf'))
        # softmax attention
        interMAF_q2v = F.softmax(q2v, dim=2).unsqueeze(3)  # [batch, num_obj, max_len, 1]
        interMAF_v2q = F.softmax(v2q, dim=2).unsqueeze(3)  # [batch, max_len, num_obj, 1]
        # calculate update input (each head is computed independently, then concatenated)
        v_update = (interMAF_q2v * q_val_slice.unsqueeze(1)).sum(2) if (i == 0) else torch.cat(
            (v_update, (interMAF_q2v * q_val_slice.unsqueeze(1)).sum(2)), dim=2)
        q_update = (interMAF_v2q * v_val_slice.unsqueeze(1)).sum(2) if (i == 0) else torch.cat(
            (q_update, (interMAF_v2q * v_val_slice.unsqueeze(1)).sum(2)), dim=2)
    # update new features
    cat_v = torch.cat((v, v_update), dim=2)
    cat_q = torch.cat((q, q_update), dim=2)
    updated_v = self.v_output(cat_v)
    updated_q = self.q_output(cat_q)
    return updated_v, updated_q
def masked_unk_softmax(self, x, dim, mask_idx):
    x1 = F.softmax(x, dim=dim)
    x1[:, mask_idx] = 0
    x1_sum = torch.sum(x1, dim=1, keepdim=True)
    y = x1 / x1_sum
    return y
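# Quick numeric check of masked_unk_softmax above, reproduced as free-standing
# code so the sketch is self-contained: the UNK column is zeroed and the
# remaining probabilities renormalize to sum to one (mask_idx=0 is assumed).
import torch
import torch.nn.functional as F

x = torch.randn(2, 5)
x1 = F.softmax(x, dim=1)
x1[:, 0] = 0                                   # zero the assumed UNK index
y = x1 / torch.sum(x1, dim=1, keepdim=True)
assert torch.allclose(y.sum(dim=1), torch.ones(2)) and (y[:, 0] == 0).all()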
img_name = sample['img_name'][0]
img = cv2.imread(img_name)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
sample2 = {}
sample2['img'] = img
sample2 = transform(sample2)
sample['img'] = sample2['img']
img = sample['img']
img = img.unsqueeze(0).to(device)

with torch.no_grad():
    seg_pred, exist_pred = net(img)[:2]
    seg_pred = F.softmax(seg_pred, dim=1)
    seg_pred = seg_pred.cpu().numpy()
    exist_pred = exist_pred.cpu().numpy()

b = 0
seg = seg_pred[b]
exist = [1 if exist_pred[b, i] > 0.5 else 0 for i in range(4)]
if dataset_name == 'Tusimple':
    lane_coords = getLane.prob2lines_tusimple(seg, exist, resize_shape=original_shape[::-1],
                                              y_px_gap=10, pts=56)
elif dataset_name == 'CULane':
    lane_coords = getLane.prob2lines_CULane(seg, exist, resize_shape=original_shape[::-1],
                                            y_px_gap=20, pts=18)
for i in range(len(lane_coords)):
    lane_coords[i] = sorted(lane_coords[i], key=lambda pair: pair[1])

img_vis = cv2.imread(img_name)
# img_vis = cv2.cvtColor(img_vis, cv2.COLOR_BGR2RGB)
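# A minimal sketch (assumed helper, not from the original script) of turning
# the [num_classes, H, W] softmax map above into a colored lane overlay; it
# assumes seg has been resized to img_vis's resolution and channel 0 is background.
import numpy as np

def overlay_lanes(img_vis, seg, threshold=0.5):
    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
    for lane_idx in range(1, seg.shape[0]):
        lane_mask = seg[lane_idx] > threshold        # boolean [H, W] mask
        img_vis[lane_mask] = colors[(lane_idx - 1) % len(colors)]
    return img_vis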
def forward(self, v, q, v_mask, q_mask):
    """
    v: visual feature   [batch, num_obj, feat_size]
    q: question         [batch, max_len, feat_size]
    v_mask              [batch, num_obj]
    q_mask              [batch, max_len]
    """
    batch_size, num_obj = v_mask.shape
    _, max_len = q_mask.shape
    # conditioned gating vector
    if APPLY_MASK:
        v_mean = (v * v_mask.unsqueeze(2)).sum(1) / v_mask.sum(1).unsqueeze(1)
        q_mean = (q * q_mask.unsqueeze(2)).sum(1) / q_mask.sum(1).unsqueeze(1)
    else:
        v_mean = v.sum(1) / num_obj
        q_mean = q.sum(1) / max_len
    v4q_gate = self.sigmoid(self.v4q_gate_lin(v_mean)).unsqueeze(1)  # [batch, 1, feat_size]
    q4v_gate = self.sigmoid(self.q4v_gate_lin(q_mean)).unsqueeze(1)  # [batch, 1, feat_size]

    # key, query, value
    v_trans = self.v_lin(v)
    q_trans = self.q_lin(q)
    # mask all padding object/word features
    if APPLY_MASK:
        v_trans = v_trans * v_mask.unsqueeze(2)
        q_trans = q_trans * q_mask.unsqueeze(2)
    # split into key / query / value
    v_key, v_qry, v_val = torch.split(v_trans, v_trans.size(2) // 3, dim=2)
    q_key, q_qry, q_val = torch.split(q_trans, q_trans.size(2) // 3, dim=2)
    # apply conditioned gate
    gated_v_qry = (1 + q4v_gate) * v_qry
    gated_v_key = (1 + q4v_gate) * v_key
    gated_v_val = (1 + q4v_gate) * v_val
    gated_q_qry = (1 + v4q_gate) * q_qry
    gated_q_key = (1 + v4q_gate) * q_key
    gated_q_val = (1 + v4q_gate) * q_val

    # split each into per-head slices
    v_key_set = torch.split(gated_v_key, gated_v_key.size(2) // self.num_head, dim=2)
    v_qry_set = torch.split(gated_v_qry, gated_v_qry.size(2) // self.num_head, dim=2)
    v_val_set = torch.split(gated_v_val, gated_v_val.size(2) // self.num_head, dim=2)
    q_key_set = torch.split(gated_q_key, gated_q_key.size(2) // self.num_head, dim=2)
    q_qry_set = torch.split(gated_q_qry, gated_q_qry.size(2) // self.num_head, dim=2)
    q_val_set = torch.split(gated_q_val, gated_q_val.size(2) // self.num_head, dim=2)
    # multi-head
    for i in range(self.num_head):
        v_key_slice, v_qry_slice, v_val_slice = v_key_set[i], v_qry_set[i], v_val_set[i]  # [batch, num_obj, feat_size]
        q_key_slice, q_qry_slice, q_val_slice = q_key_set[i], q_qry_set[i], q_val_set[i]  # [batch, max_len, feat_size]
        # calculate attention
        v2v = (v_qry_slice @ v_key_slice.transpose(1, 2)) / (
            (self.output_size // self.num_head) ** 0.5)
        q2q = (q_qry_slice @ q_key_slice.transpose(1, 2)) / (
            (self.output_size // self.num_head) ** 0.5)
        if APPLY_MASK:
            v2v.masked_fill_(v_mask.unsqueeze(1).expand([batch_size, num_obj, num_obj]) == 0,
                             -float('inf'))
            q2q.masked_fill_(q_mask.unsqueeze(1).expand([batch_size, max_len, max_len]) == 0,
                             -float('inf'))
        dyIntraMAF_v2v = F.softmax(v2v, dim=2).unsqueeze(3)  # [batch, num_obj, num_obj, 1]
        dyIntraMAF_q2q = F.softmax(q2q, dim=2).unsqueeze(3)  # [batch, max_len, max_len, 1]
        # calculate update input
        v_update = (dyIntraMAF_v2v * v_val_slice.unsqueeze(1)).sum(2) if (i == 0) else torch.cat(
            (v_update, (dyIntraMAF_v2v * v_val_slice.unsqueeze(1)).sum(2)), dim=2)
        q_update = (dyIntraMAF_q2q * q_val_slice.unsqueeze(1)).sum(2) if (i == 0) else torch.cat(
            (q_update, (dyIntraMAF_q2q * q_val_slice.unsqueeze(1)).sum(2)), dim=2)
    # update
    updated_v = self.v_output(v + v_update)
    updated_q = self.q_output(q + q_update)
    return updated_v, updated_q
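# The pattern shared by the two forward() variants above, distilled into one
# self-contained helper: scaled dot-product attention with -inf masking of
# padded key positions before the softmax. Shapes and names are illustrative.
import torch
import torch.nn.functional as F

def masked_attention(qry, key, val, key_mask):
    # qry: [B, Nq, D]; key, val: [B, Nk, D]; key_mask: [B, Nk], 1 = real, 0 = pad
    logits = qry @ key.transpose(1, 2) / key.size(2) ** 0.5        # [B, Nq, Nk]
    logits = logits.masked_fill(key_mask.unsqueeze(1) == 0, float('-inf'))
    return F.softmax(logits, dim=2) @ val                          # [B, Nq, D]

out = masked_attention(torch.randn(2, 3, 8), torch.randn(2, 4, 8),
                       torch.randn(2, 4, 8), torch.ones(2, 4))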
def forward(self, x):
    """ The input should be of size [batch_size, 3, img_h, img_w] """
    _, _, img_h, img_w = x.size()
    cfg._tmp_img_h = img_h
    cfg._tmp_img_w = img_w

    with timer.env('backbone'):
        outs = self.backbone(x)

    if cfg.fpn is not None:
        with timer.env('fpn'):
            # Use backbone.selected_layers because we overwrote self.selected_layers
            outs = [outs[i] for i in cfg.backbone.selected_layers]
            outs = self.fpn(outs)

    proto_out = None
    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        with timer.env('proto'):
            proto_x = x if self.proto_src is None else outs[self.proto_src]
            if self.num_grids > 0:
                grids = self.grid.repeat(proto_x.size(0), 1, 1, 1)
                proto_x = torch.cat([proto_x, grids], dim=1)

            proto_out = self.proto_net(proto_x)
            proto_out = cfg.mask_proto_prototype_activation(proto_out)

            if cfg.mask_proto_prototypes_as_features:
                # Clone here because we don't want to permute this, though idk if contiguous makes this unnecessary
                proto_downsampled = proto_out.clone()
                if cfg.mask_proto_prototypes_as_features_no_grad:
                    proto_downsampled = proto_out.detach()

            # Move the features last so the multiplication is easy
            proto_out = proto_out.permute(0, 2, 3, 1).contiguous()

            if cfg.mask_proto_bias:
                bias_shape = [x for x in proto_out.size()]
                bias_shape[-1] = 1
                proto_out = torch.cat([proto_out, torch.ones(*bias_shape)], -1)

    with timer.env('pred_heads'):
        pred_outs = {'loc': [], 'conf': [], 'mask': [], 'priors': []}
        if cfg.use_mask_scoring:
            pred_outs['score'] = []
        if cfg.use_instance_coeff:
            pred_outs['inst'] = []

        for idx, pred_layer in zip(self.selected_layers, self.prediction_layers):
            pred_x = outs[idx]
            if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features:
                # Scale the prototypes down to the current prediction layer's size and add them as inputs
                proto_downsampled = F.interpolate(proto_downsampled, size=outs[idx].size()[2:],
                                                  mode='bilinear', align_corners=False)
                pred_x = torch.cat([pred_x, proto_downsampled], dim=1)

            # A hack for the way dataparallel works
            if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]:
                pred_layer.parent = [self.prediction_layers[0]]

            p = pred_layer(pred_x)
            for k, v in p.items():
                pred_outs[k].append(v)

    for k, v in pred_outs.items():
        pred_outs[k] = torch.cat(v, -2)

    if proto_out is not None:
        pred_outs['proto'] = proto_out

    if self.training:
        # For the extra loss functions
        if cfg.use_class_existence_loss:
            pred_outs['classes'] = self.class_existence_fc(outs[-1].mean(dim=(2, 3)))
        if cfg.use_semantic_segmentation_loss:
            pred_outs['segm'] = self.semantic_seg_conv(outs[0])
        return pred_outs
    else:
        if cfg.use_mask_scoring:
            pred_outs['score'] = torch.sigmoid(pred_outs['score'])

        if cfg.use_focal_loss:
            if cfg.use_sigmoid_focal_loss:
                # Note: even though conf[0] exists, this mode doesn't train it so don't use it
                pred_outs['conf'] = torch.sigmoid(pred_outs['conf'])
                if cfg.use_mask_scoring:
                    pred_outs['conf'] *= pred_outs['score']
            elif cfg.use_objectness_score:
                # See focal_loss_sigmoid in multibox_loss.py for details
                objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = objectness[:, :, None] * F.softmax(
                    pred_outs['conf'][:, :, 1:], -1)
                pred_outs['conf'][:, :, 0] = 1 - objectness
            else:
                pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)
        else:
            if cfg.use_objectness_score:
                objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = (objectness > 0.10)[..., None] \
                    * F.softmax(pred_outs['conf'][:, :, 1:], dim=-1)
            else:
                pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)

        return self.detect(pred_outs, self)
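# Small numeric illustration of the objectness-gated softmax branch above:
# class probabilities only survive where the sigmoid objectness clears the
# 0.10 threshold. Shapes are illustrative ([batch, priors, 1 + num_classes]).
import torch
import torch.nn.functional as F

conf = torch.randn(1, 3, 5)
objectness = torch.sigmoid(conf[:, :, 0])                          # [1, 3]
gated = (objectness > 0.10)[..., None] * F.softmax(conf[:, :, 1:], dim=-1)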
def train(self, save_embedding=False):
    self.set_mode('train')
    preprocessor = self.data_loader['train'].dataset.preprocessor
    temp_min = 0.1
    anneal_rate = self.anneal_rate
    temp = 1.
    total_loss = 0.
    total_step = 0
    for e in range(self.epoch):
        self.global_epoch += 1
        pred_word_labels = []
        gold_word_labels = []
        pred_phone_labels = []
        gold_phone_labels = []
        for b_idx, (audios, phoneme_labels, word_labels,
                    audio_masks, phone_masks, word_masks) in enumerate(self.data_loader['train']):
            if b_idx > 2 and self.debug:
                break
            self.global_iter += 1

            x = cuda(audios, self.cuda)
            if self.audio_feature == "wav2vec2":
                x = self.audio_feature_net.feature_extractor(x)
            phoneme_labels = cuda(phoneme_labels, self.cuda)
            word_labels = cuda(word_labels, self.cuda)
            audio_masks = cuda(audio_masks, self.cuda)
            phone_masks = cuda(phone_masks, self.cuda)
            word_masks = cuda(word_masks, self.cuda)

            if self.audio_net.ds_ratio > 1:
                audio_masks = audio_masks[:, ::self.audio_net.ds_ratio]
                word_masks = word_masks[:, :, ::self.audio_net.ds_ratio]
            audio_lens = audio_masks.sum(-1).long()
            sent_lens = phone_masks.sum(-1).long()
            word_lens = (word_labels >= 0).long().sum(-1)

            phone_logits, word_logits, _, embedding = self.audio_net(
                x, masks=audio_masks, temp=temp,
                num_sample=self.num_sample, return_feat=True)

            # Compute phoneme one-hot vectors
            phoneme_vectors = F.one_hot(phoneme_labels, self.n_phone_class)
            phone_denoised_logits, phone_word_logits, denoised_encodings, embedding = \
                self.phone_net(phoneme_vectors, temp=temp,
                               num_sample=self.num_sample, return_feat=True)

            quantized = None
            if self.model_type == 'vq-mlp':
                # out_logits is assumed to come from self.audio_net in this mode
                word_logits = out_logits[:, :, :self.n_visual_class]
                quantized = out_logits[:, :, self.n_visual_class:]

            word_logits = torch.matmul(word_masks, word_logits)
            word_loss = F.cross_entropy(
                word_logits.permute(0, 2, 1), word_labels,
                ignore_index=-100).div(math.log(2))
            info_loss = (F.softmax(phone_logits, dim=-1)
                         * F.log_softmax(phone_logits, dim=-1)
                         ).sum().div(audio_lens.sum() * math.log(2))

            # Permutation-invariant CTC loss for multilingual phones
            batch_size = x.size(0)
            phone_word_losses = []
            num_words = torch.where(word_masks.sum(-1) > 0,  # was np.where, which fails on torch tensors
                                    torch.tensor(1, device=x.device),
                                    torch.tensor(0, device=x.device)).sum(-1)
            for idx in range(batch_size):
                word_orders = list(itertools.permutations(range(num_words[idx])))
                word_orders = word_orders[:200]  # limit the number of orders
                # Stack the per-order losses; torch.max over a plain Python list fails
                phone_word_losses.append(torch.stack(
                    [F.ctc_loss(F.log_softmax(phone_denoised_logits[idx], dim=-1).permute(1, 0, 2),
                                word_labels[word_order],
                                sent_lens[idx],
                                num_words[idx])
                     for word_order in word_orders]).max())
            phone_word_loss = torch.stack(phone_word_losses).sum()  # was torch.sum over a list
            phone_info_loss = (F.softmax(phone_denoised_logits, dim=-1)
                               * F.log_softmax(phone_denoised_logits, dim=-1)
                               ).sum().div(sent_lens.sum() * math.log(2))

            # Use denoised phoneme labels for training the phoneme classifier
            phone_word_encodings = F.gumbel_softmax(phone_word_logits, tau=temp, dim=-1)
            denoising_mask = torch.where(phone_word_encodings.max(-1)[1].detach() > 0,
                                         torch.tensor(1, device=x.device),
                                         torch.tensor(0, device=x.device)).detach()
            phoneme_labels_denoised = denoising_mask * denoised_encodings.max(-1)[1].detach() \
                + (1 - denoising_mask) * phoneme_labels
            phone_loss = F.ctc_loss(
                F.log_softmax(phone_logits, dim=-1).permute(1, 0, 2),
                phoneme_labels_denoised, audio_lens, sent_lens)

            audio_ib_loss = self.weight_phone_loss * phone_loss \
                + self.weight_word_loss * word_loss \
                + self.beta * info_loss
            phone_ib_loss = self.weight_phone_word_loss * phone_word_loss \
                + self.beta * phone_info_loss  # TODO weight_phone_word
            loss = audio_ib_loss + phone_ib_loss
            if self.model_type == 'vq-mlp':
                loss += self.audio_net.quantize_loss(embedding, quantized, masks=audio_masks)

            izy_bound = math.log(self.n_visual_class, 2) - word_loss
            izx_bound = info_loss
            total_loss += loss.cpu().detach().numpy()
            total_step += 1.

            self.optim.zero_grad()
            loss.backward()
            if self.max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.audio_net.parameters(), self.max_grad_norm)
            self.optim.step()

            for i in range(audios.size(0)):
                audio_len = audio_lens[i]
                sent_len = sent_lens[i]
                word_len = word_lens[i]
                gold_phone_label = phoneme_labels_denoised[i, :sent_len]
                pred_phone_label = phone_logits[i, :audio_len].max(-1)[1]
                gold_phone_labels.append(gold_phone_label.cpu().detach().numpy().tolist())
                pred_phone_labels.append(pred_phone_label.cpu().detach().numpy().tolist())
                if word_len > 0:
                    gold_word_labels.append(word_labels[i, :word_len].cpu().detach().numpy().tolist())
                    pred_word_label = word_logits[i, :word_len].max(-1)[1]
                    pred_word_labels.append(pred_word_label.cpu().detach().numpy().tolist())

            if self.global_iter % 1000 == 0:
                temp = np.maximum(temp * np.exp(-anneal_rate * b_idx), temp_min)
                avg_loss = total_loss / total_step
                print(f'i:{self.global_iter:d} temp:{temp} avg loss (total loss):{avg_loss:.2f} ({total_loss:.2f}) '
                      f'IZY:{izy_bound:.2f} IZX:{izx_bound:.2f} '
                      f'phone_loss:{phone_loss:.5f} phone_word_loss:{phone_word_loss:.5f}')

        # Evaluate training visual word classification accuracy and phone token error rate
        acc = compute_accuracy(gold_word_labels, pred_word_labels)
        dist, n_tokens = compute_edit_distance(pred_phone_labels, gold_phone_labels, preprocessor)
        pter = float(dist) / float(n_tokens)
        print(f'Epoch {self.global_epoch}\ttraining visual word accuracy: {acc:.3f}\t'
              f'training phone token error rate: {pter:.3f}')
        if (self.global_epoch % 2) == 0:
            self.scheduler.step()
        self.test(save_embedding=save_embedding)
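# Toy illustration of the Gumbel-softmax temperature schedule used in train()
# above: samples sharpen toward one-hot as temp decays every 1000 steps from
# 1.0 toward temp_min (the anneal_rate value here is illustrative).
import numpy as np
import torch
import torch.nn.functional as F

logits = torch.randn(1, 10)
temp, temp_min, anneal_rate = 1.0, 0.1, 3e-5
for step in range(1, 5001):
    sample = F.gumbel_softmax(logits, tau=temp, dim=-1)  # soft one-hot draw
    if step % 1000 == 0:
        temp = max(temp * float(np.exp(-anneal_rate * step)), temp_min)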