def __call__(self, scores, temperature=1., dim=-1):
    is_training = self.rop.training  # only use stochastic at training
    if is_training:
        if self.use_gumbel:
            gumbel_eps = self.gumbel_eps
            G = (BK.rand(BK.get_shape(scores)) + gumbel_eps).clamp(max=1.)  # [0,1)
            scores = scores - (gumbel_eps - G.log()).log()
    # normalize
    probs = BK.softmax(scores / temperature, dim=dim)  # [*, S]
    # prune and re-normalize?
    if self.prune_val > 0.:
        probs = probs * (probs > self.prune_val).float()
        # todo(note): currently no re-normalize
        # probs = probs / probs.sum(dim=dim, keepdim=True)  # [*, S]
    # argmax and ste
    if self.use_argmax:  # use the hard argmax
        max_probs, _ = probs.max(dim, keepdim=True)  # [*, 1]
        # todo(+N): currently we do not re-normalize here, should it be done here?
        st_probs = (probs >= max_probs).float() * probs  # [*, S]
        if is_training:
            # straight-through: (hard-soft).detach() + soft
            st_probs = (st_probs - probs).detach() + probs  # [*, S]
        return st_probs
    else:
        return probs
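
# Illustrative sketch (not part of the original module): the call above combines
# Gumbel perturbation, a temperature softmax, and a straight-through argmax.
# A minimal plain-PyTorch rendering of the same trick, assuming torch is available;
# the function name and eps value are hypothetical. Unlike the module above, this
# sketch returns a one-hot hard vector rather than keeping the soft probability at
# the argmax position.
import torch

def st_gumbel_softmax_sketch(scores: torch.Tensor, temperature: float = 1., eps: float = 1e-10) -> torch.Tensor:
    # Gumbel(0,1) noise: -log(-log(U)) with U ~ Uniform(0,1); eps guards the logs
    u = torch.rand_like(scores).clamp(min=eps, max=1.)
    noisy = scores - (eps - u.log()).log()
    soft = torch.softmax(noisy / temperature, dim=-1)  # differentiable sample
    hard = torch.zeros_like(soft).scatter_(-1, soft.argmax(-1, keepdim=True), 1.)  # one-hot
    # straight-through: forward pass uses `hard`, gradients flow through `soft`
    return (hard - soft).detach() + soft
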
def __call__(self, word_arr: np.ndarray = None, char_arr: np.ndarray = None,
             extra_arrs: Iterable[np.ndarray] = (), aux_arrs: Iterable[np.ndarray] = ()):
    exprs = []
    # word/char/extras/posi
    seq_shape = None
    if self.has_word:
        # todo(warn): singleton-UNK-dropout should be done outside before
        seq_shape = word_arr.shape
        word_expr = self.dropmd_word(self.word_embed(word_arr))
        exprs.append(word_expr)
    if self.has_char:
        seq_shape = char_arr.shape[:-1]
        char_embeds = self.char_embed(char_arr)  # [*, seq-len, word-len, D]
        char_cat_expr = self.dropmd_char(BK.concat([z(char_embeds) for z in self.char_cnns]))
        exprs.append(char_cat_expr)
    zcheck(len(extra_arrs) == len(self.extra_embeds), "Unmatched extra fields.")
    for one_extra_arr, one_extra_embed, one_extra_dropmd in zip(extra_arrs, self.extra_embeds, self.dropmd_extras):
        seq_shape = one_extra_arr.shape
        exprs.append(one_extra_dropmd(one_extra_embed(one_extra_arr)))
    if self.has_posi:
        seq_len = seq_shape[-1]
        posi_idxes = BK.arange_idx(seq_len)
        posi_input0 = self.posi_embed(posi_idxes)
        for _ in range(len(seq_shape) - 1):
            posi_input0 = BK.unsqueeze(posi_input0, 0)
        posi_input1 = BK.expand(posi_input0, tuple(seq_shape) + (-1,))
        exprs.append(posi_input1)
    #
    assert len(aux_arrs) == len(self.drop_auxes)
    for one_aux_arr, one_aux_dim, one_aux_drop, one_fold, one_gamma, one_lambdas in \
            zip(aux_arrs, self.dim_auxes, self.drop_auxes, self.fold_auxes,
                self.aux_overall_gammas, self.aux_fold_lambdas):
        # fold and apply trainable lambdas
        input_aux_repr = BK.input_real(one_aux_arr)
        input_shape = BK.get_shape(input_aux_repr)
        # todo(note): assume the original concat is [fold/layer, D]
        reshaped_aux_repr = input_aux_repr.view(input_shape[:-1] + [one_fold, one_aux_dim])  # [*, slen, fold, D]
        # mix the folds with softmaxed lambdas, then rescale with the overall gamma
        lambdas_softmax = BK.softmax(one_lambdas, -1).unsqueeze(-1)  # [fold, 1]
        weighted_aux_repr = (reshaped_aux_repr * lambdas_softmax).sum(-2) * one_gamma  # [*, slen, D]
        one_aux_expr = one_aux_drop(weighted_aux_repr)
        exprs.append(one_aux_expr)
    #
    concated_exprs = BK.concat(exprs, dim=-1)
    # optional proj
    if self.has_proj:
        final_expr = self.final_layer(concated_exprs)
    else:
        final_expr = concated_exprs
    return final_expr
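
# Illustrative sketch (not part of the original embedder): the has_posi branch above
# embeds positions 0..slen-1 once and then broadcasts them over the leading batch
# dimensions before concatenation. A minimal torch version of that expansion; the
# function name is hypothetical and posi_embed is assumed to be an nn.Embedding.
import torch
import torch.nn as nn

def expand_position_embeddings(posi_embed: nn.Embedding, seq_shape: tuple) -> torch.Tensor:
    # seq_shape is e.g. (batch, slen); returns [batch, slen, D]
    seq_len = seq_shape[-1]
    posi = posi_embed(torch.arange(seq_len))  # [slen, D]
    for _ in range(len(seq_shape) - 1):  # add leading singleton dims: [1, ..., slen, D]
        posi = posi.unsqueeze(0)
    return posi.expand(*seq_shape, -1)  # broadcast to [*, slen, D]
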
def __call__(self, input_repr, mask_arr, require_loss, require_pred, gold_pos_arr=None):
    enc0_expr = self.enc(input_repr, mask_arr)  # [*, len, d]
    #
    enc1_expr = enc0_expr
    pos_probs, pos_losses_expr, pos_preds_expr = None, None, None
    if self.jpos_multitask:
        # get probabilities
        pos_logits = self.pred(enc0_expr)  # [*, len, nl]
        pos_probs = BK.softmax(pos_logits, dim=-1)
        # stacking for input -> output
        if self.jpos_stacking:
            enc1_expr = enc0_expr + BK.matmul(pos_probs, self.pos_weights)
        # simple cross entropy loss
        if require_loss and self.jpos_lambda > 0.:
            gold_probs = BK.gather_one_lastdim(pos_probs, gold_pos_arr).squeeze(-1)  # [*, len]
            # todo(warn): multiplying the factor here, but not masking here (masking in the final steps)
            pos_losses_expr = (-self.jpos_lambda) * gold_probs.log()
        # simple argmax for prediction
        if require_pred and self.jpos_decode:
            pos_preds_expr = pos_probs.max(dim=-1)[1]
    return enc1_expr, (pos_probs, pos_losses_expr, pos_preds_expr)
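
# Illustrative sketch (not part of the original jpos module): the stacking step above
# feeds the predicted POS distribution back into the encoder stream as a soft
# embedding, enc1 = enc0 + probs @ pos_weights. A minimal torch rendering with
# hypothetical names (D = hidden size, NL = number of POS tags).
import torch

def jpos_stack_sketch(enc0: torch.Tensor, pos_logits: torch.Tensor, pos_weights: torch.Tensor) -> torch.Tensor:
    # enc0: [*, len, D]; pos_logits: [*, len, NL]; pos_weights: [NL, D]
    pos_probs = torch.softmax(pos_logits, dim=-1)  # [*, len, NL]
    soft_pos_embed = pos_probs @ pos_weights       # expected POS embedding, [*, len, D]
    return enc0 + soft_pos_embed
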
def _my_loss_prob(self, score_expr, gold_idxes_expr, entropy_lambda: float, loss_mask, neg_reweight: bool):
    probs = BK.softmax(score_expr, -1)  # [*, NLab]
    log_probs = BK.log(probs + 1e-8)
    # first plain NLL loss
    nll_loss = -BK.gather_one_lastdim(log_probs, gold_idxes_expr).squeeze(-1)
    # next the special loss
    if entropy_lambda > 0.:
        negative_entropy = probs * log_probs  # [*, NLab]
        last_dim = BK.get_shape(score_expr, -1)
        confusion_matrix = 1. - BK.eye(last_dim)  # [Nlab, Nlab]
        entropy_mask = confusion_matrix[gold_idxes_expr]  # [*, Nlab]
        entropy_loss = (negative_entropy * entropy_mask).sum(-1)
        final_loss = nll_loss + entropy_lambda * entropy_loss
    else:
        final_loss = nll_loss
    # reweight?
    if neg_reweight:
        golden_prob = BK.gather_one_lastdim(probs, gold_idxes_expr).squeeze(-1)
        is_full_nil = (gold_idxes_expr == 0.).float()
        not_full_nil = 1. - is_full_nil
        count_pos = (loss_mask * not_full_nil).sum()
        count_neg = (loss_mask * is_full_nil).sum()
        prob_pos = (loss_mask * not_full_nil * golden_prob).sum()
        prob_neg = (loss_mask * is_full_nil * golden_prob).sum()
        neg_weight = prob_pos / (count_pos + count_neg - prob_neg + 1e-8)
        final_weights = not_full_nil + is_full_nil * neg_weight
        # todo(note): final mask will be applied at outside
        final_loss = final_loss * final_weights
    return final_loss
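
# Illustrative sketch (not part of the original loss): the entropy term above adds
# lambda * sum_{j != gold} p_j * log p_j to the NLL; minimizing it spreads whatever
# probability mass is not on the gold label evenly over the remaining labels instead
# of concentrating it on a single confusable label. A minimal torch version with
# hypothetical names.
import torch

def nll_with_nongold_entropy(logits: torch.Tensor, gold: torch.Tensor, entropy_lambda: float) -> torch.Tensor:
    # logits: [*, NLab]; gold: [*] (long); returns per-item loss [*]
    probs = torch.softmax(logits, dim=-1)
    log_probs = torch.log(probs + 1e-8)
    nll = -log_probs.gather(-1, gold.unsqueeze(-1)).squeeze(-1)
    non_gold = torch.ones_like(probs).scatter_(-1, gold.unsqueeze(-1), 0.)  # mask out the gold column
    neg_entropy = (probs * log_probs * non_gold).sum(-1)  # negative entropy over non-gold labels
    return nll + entropy_lambda * neg_entropy
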
def _score(self, input_expr, input_mask, scores_aug_tok=None, scores_aug_sent=None):
    # token level attention and score
    # calculate the attention
    query_tok = self.query_tok  # [L, D]
    query_tok_t = query_tok.transpose(0, 1)  # [D, L]
    att_scores = BK.matmul(input_expr, query_tok_t)  # [*, slen, L]
    att_scores += (1. - input_mask).unsqueeze(-1) * Constants.REAL_PRAC_MIN
    if scores_aug_tok is not None:  # margin
        att_scores += scores_aug_tok
    attn = BK.softmax(att_scores, -2)  # [*, slen, L]
    score_tok = (att_scores * attn).sum(-2)  # [*, L]
    # token level labeling softmax
    attn2 = BK.softmax(att_scores.view(BK.get_shape(att_scores)[:-2] + [-1]), -1)  # [*, slen*L]
    # sent level score
    query_sent = self.query_sent  # [L, D]
    context_sent = input_expr[:, 0] + input_expr[:, -1]  # [*, D], simply adding the two ends
    score_sent = BK.matmul(context_sent, query_sent.transpose(0, 1))  # [*, L]
    # combine
    if self.lambda_score_tok < 0.:
        context_tok = BK.matmul(input_expr.transpose(-1, -2), attn).transpose(-1, -2).contiguous()  # [*, L, D]
        # 4*[*,L,D] -> [*, L]
        cur_lambda_score_tok = self.score_gate([
            context_tok, query_tok.unsqueeze(0), context_sent.unsqueeze(-2), query_sent.unsqueeze(0)
        ]).squeeze(-1)
    else:
        cur_lambda_score_tok = self.lambda_score_tok
    final_score = score_tok * cur_lambda_score_tok + score_sent * (1. - cur_lambda_score_tok)
    if scores_aug_sent is not None:  # margin
        final_score += scores_aug_sent
    if self.conf.score_sigmoid:
        final_score = BK.sigmoid(final_score)
    return final_score, attn, attn2  # [*, L], [*, slen, L], [*, slen*L]
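
# Illustrative sketch (not part of the original scorer): the token-level part above
# computes per-label attention over tokens and pools the raw scores with those
# attention weights, score[l] = sum_t attn[t, l] * s[t, l]. A minimal torch sketch of
# that pooling; names are hypothetical and NEG_INF stands in for Constants.REAL_PRAC_MIN.
import torch

NEG_INF = -1e4

def attention_pooled_scores(hidden: torch.Tensor, label_queries: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # hidden: [*, slen, D]; label_queries: [L, D]; mask: [*, slen] with 1 for real tokens
    att_scores = hidden @ label_queries.transpose(0, 1)            # [*, slen, L]
    att_scores = att_scores + (1. - mask).unsqueeze(-1) * NEG_INF  # hide padding tokens
    attn = torch.softmax(att_scores, dim=-2)                       # normalize over tokens
    return (att_scores * attn).sum(-2)                             # [*, L]
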
def fb_on_batch(self, annotated_insts, training=True, loss_factor=1, **kwargs):
    self.refresh_batch(training)
    margin = self.margin.value
    # gold heads and labels
    gold_heads_arr, _ = self.predict_padder.pad([z.heads.vals for z in annotated_insts])
    gold_labels_arr, _ = self.predict_padder.pad([self.real2pred_labels(z.labels.idxes) for z in annotated_insts])
    gold_heads_expr = BK.input_idx(gold_heads_arr)  # [BS, Len]
    gold_labels_expr = BK.input_idx(gold_labels_arr)  # [BS, Len]
    # ===== calculate
    scoring_expr_pack, mask_expr, jpos_pack = self._prepare_score(annotated_insts, training)
    full_arc_score = self._score_arc_full(scoring_expr_pack, mask_expr, training, margin, gold_heads_expr)
    #
    final_losses = None
    if self.norm_local or self.norm_single:
        select_label_score = self._score_label_selected(scoring_expr_pack, mask_expr, training, margin,
                                                        gold_heads_expr, gold_labels_expr)
        # already added margin previously
        losses_heads = losses_labels = None
        if self.loss_prob:
            if self.norm_local:
                losses_heads = BK.loss_nll(full_arc_score, gold_heads_expr)
                losses_labels = BK.loss_nll(select_label_score, gold_labels_expr)
            elif self.norm_single:
                single_sample = self.conf.tconf.loss_single_sample
                losses_heads = self._losses_single(full_arc_score, gold_heads_expr, single_sample, is_hinge=False)
                losses_labels = self._losses_single(select_label_score, gold_labels_expr, single_sample, is_hinge=False)
            # simply adding
            final_losses = losses_heads + losses_labels
        elif self.loss_hinge:
            if self.norm_local:
                losses_heads = BK.loss_hinge(full_arc_score, gold_heads_expr)
                losses_labels = BK.loss_hinge(select_label_score, gold_labels_expr)
            elif self.norm_single:
                single_sample = self.conf.tconf.loss_single_sample
                losses_heads = self._losses_single(full_arc_score, gold_heads_expr, single_sample,
                                                   is_hinge=True, margin=margin)
                losses_labels = self._losses_single(select_label_score, gold_labels_expr, single_sample,
                                                    is_hinge=True, margin=margin)
            # simply adding
            final_losses = losses_heads + losses_labels
        elif self.loss_mr:
            # special treatment!
            probs_heads = BK.softmax(full_arc_score, dim=-1)  # [bs, m, h]
            probs_labels = BK.softmax(select_label_score, dim=-1)  # [bs, m, h]
            # select
            probs_head_gold = BK.gather_one_lastdim(probs_heads, gold_heads_expr).squeeze(-1)  # [bs, m]
            probs_label_gold = BK.gather_one_lastdim(probs_labels, gold_labels_expr).squeeze(-1)  # [bs, m]
            # root and pad will be excluded later
            # Reward = \sum_i 1.*marginal(GEdge_i); while for global models, need to gradient on marginal-functions
            # todo(warn): have problem since steps will be quite small, not used!
            final_losses = (mask_expr - probs_head_gold * probs_label_gold)  # let loss>=0
    elif self.norm_global:
        full_label_score = self._score_label_full(scoring_expr_pack, mask_expr, training, margin,
                                                  gold_heads_expr, gold_labels_expr)
        # for this one, use the merged full score
        full_score = full_arc_score.unsqueeze(-1) + full_label_score  # [BS, m, h, L]
        # +=1 to include ROOT for mst decoding
        mst_lengths_arr = np.asarray([len(z) + 1 for z in annotated_insts], dtype=np.int32)
        # do inference
        if self.loss_prob:
            marginals_expr = self._marginal(full_score, mask_expr, mst_lengths_arr)  # [BS, m, h, L]
            final_losses = self._losses_global_prob(full_score, gold_heads_expr, gold_labels_expr,
                                                    marginals_expr, mask_expr)
            if self.alg_proj:
                # todo(+N): deal with search-error-like problem, discard unproj neg losses (score>weighted-avg),
                #  but this might be too loose, although the unproj edges are few?
                gold_unproj_arr, _ = self.predict_padder.pad([z.unprojs for z in annotated_insts])
                gold_unproj_expr = BK.input_real(gold_unproj_arr)  # [BS, Len]
                comparing_expr = Constants.REAL_PRAC_MIN * (1. - gold_unproj_expr)
                final_losses = BK.max_elem(final_losses, comparing_expr)
        elif self.loss_hinge:
            pred_heads_arr, pred_labels_arr, _ = self._decode(full_score, mask_expr, mst_lengths_arr)
            pred_heads_expr = BK.input_idx(pred_heads_arr)  # [BS, Len]
            pred_labels_expr = BK.input_idx(pred_labels_arr)  # [BS, Len]
            #
            final_losses = self._losses_global_hinge(full_score, gold_heads_expr, gold_labels_expr,
                                                     pred_heads_expr, pred_labels_expr, mask_expr)
        elif self.loss_mr:
            # todo(+N): Loss = -Reward = \sum marginals, which requires gradients on marginal-one-edge, or marginal-two-edges
            raise NotImplementedError("Not implemented for global-loss + mr.")
    elif self.norm_hlocal:
        # firstly label losses are the same
        select_label_score = self._score_label_selected(scoring_expr_pack, mask_expr, training, margin,
                                                        gold_heads_expr, gold_labels_expr)
        losses_labels = BK.loss_nll(select_label_score, gold_labels_expr)
        # then specially for arc loss
        children_masks_arr, _ = self.hlocal_padder.pad([z.get_children_mask_arr() for z in annotated_insts])
        children_masks_expr = BK.input_real(children_masks_arr)  # [bs, h, m]
        # [bs, h]
        # todo(warn): use prod rather than sum, but still only an approximation for the top-down
        # losses_arc = -BK.log(BK.sum(BK.softmax(full_arc_score, -2).transpose(-1, -2) * children_masks_expr, dim=-1) + (1-mask_expr))
        losses_arc = -BK.sum(BK.log_softmax(full_arc_score, -2).transpose(-1, -2) * children_masks_expr, dim=-1)
        # including the root-head is important
        losses_arc[:, 1] += losses_arc[:, 0]
        final_losses = losses_arc + losses_labels
    #
    # jpos loss? (the same mask as parsing)
    jpos_losses_expr = jpos_pack[1]
    if jpos_losses_expr is not None:
        final_losses += jpos_losses_expr
    # collect loss with mask, also excluding the first symbol of ROOT
    final_losses_masked = (final_losses * mask_expr)[:, 1:]
    final_loss_sum = BK.sum(final_losses_masked)
    # divide loss by what?
    num_sent = len(annotated_insts)
    num_valid_tok = sum(len(z) for z in annotated_insts)
    if self.conf.tconf.loss_div_tok:
        final_loss = final_loss_sum / num_valid_tok
    else:
        final_loss = final_loss_sum / num_sent
    #
    final_loss_sum_val = float(BK.get_value(final_loss_sum))
    info = {"sent": num_sent, "tok": num_valid_tok, "loss_sum": final_loss_sum_val}
    if training:
        BK.backward(final_loss, loss_factor)
    return info
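
# Illustrative sketch (not part of the original trainer): the tail of fb_on_batch
# masks the per-token losses, drops the artificial ROOT position, and divides by
# either the token count or the sentence count. A minimal torch version with
# hypothetical names (div_by_tok plays the role of conf.tconf.loss_div_tok).
import torch

def reduce_parser_loss(per_tok_loss: torch.Tensor, mask: torch.Tensor, sent_lengths: list, div_by_tok: bool) -> torch.Tensor:
    # per_tok_loss, mask: [bs, 1 + max_len] where index 0 is the ROOT symbol
    loss_sum = (per_tok_loss * mask)[:, 1:].sum()
    denom = sum(sent_lengths) if div_by_tok else len(sent_lengths)
    return loss_sum / denom
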
def __call__(self, bert_t):
    lambdas_softmax = BK.softmax(self.bert_lambdas, -1).unsqueeze(-1)  # [fold, 1]
    weighted_bert_t = (bert_t * lambdas_softmax).sum(-2) * self.bert_gamma  # [*, D]
    return weighted_bert_t
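
# Illustrative sketch (not part of the original module): the call above is an
# ELMo-style scalar mix, i.e. a softmax over per-layer lambdas weights the stacked
# layer representations and a single gamma rescales the mixture. A self-contained
# torch module with hypothetical names.
import torch
import torch.nn as nn

class ScalarMixSketch(nn.Module):
    def __init__(self, num_layers: int):
        super().__init__()
        self.lambdas = nn.Parameter(torch.zeros(num_layers))  # per-layer mixing weights
        self.gamma = nn.Parameter(torch.ones(1))               # overall scale

    def forward(self, layered: torch.Tensor) -> torch.Tensor:
        # layered: [*, num_layers, D] -> [*, D]
        weights = torch.softmax(self.lambdas, dim=-1).unsqueeze(-1)  # [num_layers, 1]
        return (layered * weights).sum(-2) * self.gamma
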