Example #1
 def __call__(self, scores, temperature=1., dim=-1):
     is_training = self.rop.training
     # only add stochastic noise during training
     if is_training:
         if self.use_gumbel:
             gumbel_eps = self.gumbel_eps
             G = (BK.rand(BK.get_shape(scores)) + gumbel_eps).clamp(max=1.)  # in (0., 1.]
             scores = scores - (gumbel_eps - G.log()).log()
     # normalize
     probs = BK.softmax(scores / temperature, dim=dim)  # [*, S]
     # prune and re-normalize?
     if self.prune_val > 0.:
         probs = probs * (probs > self.prune_val).float()
         # todo(note): currently no re-normalize
         # probs = probs / probs.sum(dim=dim, keepdim=True)  # [*, S]
     # argmax and ste
     if self.use_argmax:  # use the hard argmax
         max_probs, _ = probs.max(dim, keepdim=True)  # [*, 1]
         # todo(+N): currently we do not re-normalize here, should it be done here?
         st_probs = (probs >= max_probs).float() * probs  # [*, S]
         if is_training:  # (hard-soft).detach() + soft
             st_probs = (st_probs - probs).detach() + probs  # [*, S]
         return st_probs
     else:
         return probs
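The call above mixes optional Gumbel-like noise, a temperature softmax, pruning, and a straight-through (ST) argmax. As a rough standalone sketch of the ST part, assuming plain PyTorch in place of the BK backend (the function name and epsilon are illustrative, not part of the library):
 import torch
 import torch.nn.functional as F

 def st_gumbel_softmax(scores, temperature=1.0, eps=1e-10, training=True):
     # optionally perturb the scores with Gumbel-like noise (training only)
     if training:
         u = torch.rand_like(scores).clamp(min=eps, max=1.0)
         scores = scores - torch.log(eps - torch.log(u))
     probs = F.softmax(scores / temperature, dim=-1)
     # hard (argmax-masked) probabilities for the forward pass
     hard = (probs >= probs.max(dim=-1, keepdim=True)[0]).float() * probs
     # straight-through: gradients flow through the soft probs
     return (hard - probs).detach() + probs if training else hard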
Example #2
 def __call__(self,
              word_arr: np.ndarray = None,
              char_arr: np.ndarray = None,
              extra_arrs: Iterable[np.ndarray] = (),
              aux_arrs: Iterable[np.ndarray] = ()):
     exprs = []
     # word/char/extras/posi
     seq_shape = None
     if self.has_word:
         # todo(warn): singleton-UNK-dropout should be done outside before
         seq_shape = word_arr.shape
         word_expr = self.dropmd_word(self.word_embed(word_arr))
         exprs.append(word_expr)
     if self.has_char:
         seq_shape = char_arr.shape[:-1]
         char_embeds = self.char_embed(char_arr)  # [*, seq-len, word-len, D]
         char_cat_expr = self.dropmd_char(BK.concat([z(char_embeds) for z in self.char_cnns]))
         exprs.append(char_cat_expr)
     zcheck(
         len(extra_arrs) == len(self.extra_embeds),
         "Unmatched extra fields.")
     for one_extra_arr, one_extra_embed, one_extra_dropmd in zip(
             extra_arrs, self.extra_embeds, self.dropmd_extras):
         seq_shape = one_extra_arr.shape
         exprs.append(one_extra_dropmd(one_extra_embed(one_extra_arr)))
     if self.has_posi:
         seq_len = seq_shape[-1]
         posi_idxes = BK.arange_idx(seq_len)
         posi_input0 = self.posi_embed(posi_idxes)
         for _ in range(len(seq_shape) - 1):
             posi_input0 = BK.unsqueeze(posi_input0, 0)
         posi_input1 = BK.expand(posi_input0, tuple(seq_shape) + (-1, ))
         exprs.append(posi_input1)
     #
     assert len(aux_arrs) == len(self.drop_auxes)
     for one_aux_arr, one_aux_dim, one_aux_drop, one_fold, one_gamma, one_lambdas in \
             zip(aux_arrs, self.dim_auxes, self.drop_auxes, self.fold_auxes, self.aux_overall_gammas, self.aux_fold_lambdas):
         # fold and apply trainable lambdas
         input_aux_repr = BK.input_real(one_aux_arr)
         input_shape = BK.get_shape(input_aux_repr)
         # todo(note): assume the original concat is [fold/layer, D]
         reshaped_aux_repr = input_aux_repr.view(input_shape[:-1] + [one_fold, one_aux_dim])  # [*, slen, fold, D]
         lambdas_softmax = BK.softmax(one_lambdas, -1).unsqueeze(-1)  # [fold, 1]
         weighted_aux_repr = (reshaped_aux_repr * lambdas_softmax).sum(-2) * one_gamma  # [*, slen, D]
         one_aux_expr = one_aux_drop(weighted_aux_repr)
         exprs.append(one_aux_expr)
     #
     concated_exprs = BK.concat(exprs, dim=-1)
     # optional proj
     if self.has_proj:
         final_expr = self.final_layer(concated_exprs)
     else:
         final_expr = concated_exprs
     return final_expr
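The embedder above concatenates whichever sources are enabled (word, char-CNN, extra fields, positions, weighted aux layers) and optionally projects the result. A much-reduced sketch of the same concatenate-and-project pattern, assuming plain PyTorch and made-up vocabulary/dimension sizes:
 import torch
 import torch.nn as nn

 class TinyEmbedder(nn.Module):
     def __init__(self, vocab=1000, d_word=64, max_len=512, d_posi=32, d_out=100):
         super().__init__()
         self.word_embed = nn.Embedding(vocab, d_word)
         self.posi_embed = nn.Embedding(max_len, d_posi)
         self.final_layer = nn.Linear(d_word + d_posi, d_out)

     def forward(self, word_idxes):  # [bs, slen]
         bs, slen = word_idxes.shape
         word_expr = self.word_embed(word_idxes)  # [bs, slen, d_word]
         # build position indexes once and broadcast them over the batch
         posi_idxes = torch.arange(slen, device=word_idxes.device)
         posi_expr = self.posi_embed(posi_idxes).unsqueeze(0).expand(bs, -1, -1)
         return self.final_layer(torch.cat([word_expr, posi_expr], dim=-1))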
Example #3
 def __call__(self,
              input_repr,
              mask_arr,
              require_loss,
              require_pred,
              gold_pos_arr=None):
     enc0_expr = self.enc(input_repr, mask_arr)  # [*, len, d]
     #
     enc1_expr = enc0_expr
     pos_probs, pos_losses_expr, pos_preds_expr = None, None, None
     if self.jpos_multitask:
         # get probabilities
         pos_logits = self.pred(enc0_expr)  # [*, len, nl]
         pos_probs = BK.softmax(pos_logits, dim=-1)
         # stacking for input -> output
         if self.jpos_stacking:
             enc1_expr = enc0_expr + BK.matmul(pos_probs, self.pos_weights)
         # simple cross entropy loss
         if require_loss and self.jpos_lambda > 0.:
             gold_probs = BK.gather_one_lastdim(
                 pos_probs, gold_pos_arr).squeeze(-1)  # [*, len]
             # todo(warn): multiplying the factor here, but not masking here (masking happens in the final steps)
             pos_losses_expr = (-self.jpos_lambda) * gold_probs.log()
         # simple argmax for prediction
         if require_pred and self.jpos_decode:
             pos_preds_expr = pos_probs.max(dim=-1)[1]
     return enc1_expr, (pos_probs, pos_losses_expr, pos_preds_expr)
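The joint-POS branch predicts tag probabilities, softly stacks them back onto the encoder states through a learned weight matrix, and returns an unmasked NLL term plus argmax predictions. A minimal sketch of those pieces, assuming plain PyTorch tensors and an illustrative jpos_lambda:
 import torch
 import torch.nn.functional as F

 def jpos_stack_and_loss(enc0, pos_logits, pos_weights, gold_pos, jpos_lambda=0.1):
     # enc0: [bs, slen, d], pos_logits: [bs, slen, nl], pos_weights: [nl, d], gold_pos: [bs, slen]
     pos_probs = F.softmax(pos_logits, dim=-1)
     enc1 = enc0 + pos_probs @ pos_weights  # soft stacking of the POS prediction
     gold_probs = pos_probs.gather(-1, gold_pos.unsqueeze(-1)).squeeze(-1)  # [bs, slen]
     losses = -jpos_lambda * torch.log(gold_probs + 1e-8)  # masking is left to the caller
     preds = pos_probs.argmax(dim=-1)
     return enc1, losses, preds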
Example #4
 def _my_loss_prob(self, score_expr, gold_idxes_expr, entropy_lambda: float,
                   loss_mask, neg_reweight: bool):
     probs = BK.softmax(score_expr, -1)  # [*, NLab]
     log_probs = BK.log(probs + 1e-8)
     # first plain NLL loss
     nll_loss = -BK.gather_one_lastdim(log_probs,
                                       gold_idxes_expr).squeeze(-1)
     # next the special loss
     if entropy_lambda > 0.:
         negative_entropy = probs * log_probs  # [*, NLab]
         last_dim = BK.get_shape(score_expr, -1)
         confusion_matrix = 1. - BK.eye(last_dim)  # [Nlab, Nlab]
         entropy_mask = confusion_matrix[gold_idxes_expr]  # [*, Nlab]
         entropy_loss = (negative_entropy * entropy_mask).sum(-1)
         final_loss = nll_loss + entropy_lambda * entropy_loss
     else:
         final_loss = nll_loss
     # reweight?
     if neg_reweight:
         golden_prob = BK.gather_one_lastdim(probs,
                                             gold_idxes_expr).squeeze(-1)
         is_full_nil = (gold_idxes_expr == 0.).float()
         not_full_nil = 1. - is_full_nil
         count_pos = (loss_mask * not_full_nil).sum()
         count_neg = (loss_mask * is_full_nil).sum()
         prob_pos = (loss_mask * not_full_nil * golden_prob).sum()
         prob_neg = (loss_mask * is_full_nil * golden_prob).sum()
         neg_weight = prob_pos / (count_pos + count_neg - prob_neg + 1e-8)
         final_weights = not_full_nil + is_full_nil * neg_weight
         # todo(note): final mask will be applied at outside
         final_loss = final_loss * final_weights
     return final_loss
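The loss is plain NLL plus an optional entropy-style penalty restricted to the non-gold labels, followed by an optional reweighting of NIL (label-0) examples. A compact sketch of the NLL-plus-penalty part, assuming plain PyTorch; the epsilon and lambda values are illustrative:
 import torch
 import torch.nn.functional as F

 def loss_prob_with_entropy(scores, gold_idxes, entropy_lambda=0.1):
     probs = F.softmax(scores, dim=-1)  # [*, NLab]
     log_probs = torch.log(probs + 1e-8)
     nll = -log_probs.gather(-1, gold_idxes.unsqueeze(-1)).squeeze(-1)
     if entropy_lambda > 0.:
         neg_entropy = probs * log_probs  # [*, NLab]
         # exclude the gold label: the penalty only acts on the non-gold probability mass
         mask = 1. - F.one_hot(gold_idxes, scores.shape[-1]).float()
         return nll + entropy_lambda * (neg_entropy * mask).sum(-1)
     return nll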
Example #5
 def _score(self,
            input_expr,
            input_mask,
            scores_aug_tok=None,
            scores_aug_sent=None):
     # token level attention and score
     # calculate the attention
     query_tok = self.query_tok  # [L, D]
     query_tok_t = query_tok.transpose(0, 1)  # [D, L]
     att_scores = BK.matmul(input_expr, query_tok_t)  # [*, slen, L]
     att_scores += (1. - input_mask).unsqueeze(-1) * Constants.REAL_PRAC_MIN
     if scores_aug_tok is not None:  # margin
         att_scores += scores_aug_tok
     attn = BK.softmax(att_scores, -2)  # [*, slen, L]
     score_tok = (att_scores * attn).sum(-2)  # [*, L]
     # token level labeling softmax
     attn2 = BK.softmax(
         att_scores.view(BK.get_shape(att_scores)[:-2] + [-1]),
         -1)  # [*, slen*L]
     # sent level score
     query_sent = self.query_sent  # [L, D]
     context_sent = input_expr[:, 0] + input_expr[:, -1]  # [*, D], simply adding the two ends
     score_sent = BK.matmul(context_sent,
                            self.query_sent.transpose(0, 1))  # [*, L]
     # combine
     if self.lambda_score_tok < 0.:
         context_tok = BK.matmul(input_expr.transpose(-1, -2), attn).transpose(-1, -2).contiguous()  # [*, L, D]
         # 4*[*,L,D] -> [*, L]
         cur_lambda_score_tok = self.score_gate([
             context_tok,
             query_tok.unsqueeze(0),
             context_sent.unsqueeze(-2),
             query_sent.unsqueeze(0)
         ]).squeeze(-1)
     else:
         cur_lambda_score_tok = self.lambda_score_tok
     final_score = score_tok * cur_lambda_score_tok + score_sent * (
         1. - cur_lambda_score_tok)
     if scores_aug_sent is not None:
         final_score += scores_aug_sent
     if self.conf.score_sigmoid:
         final_score = BK.sigmoid(final_score)
     return final_score, attn, attn2  # [*, L], [*, slen, L], [*, slen*L]
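The token-level score is an attention-weighted sum of per-token label scores, with padded positions pushed to a very negative value before the softmax; the sentence-level score and the gate then blend in on top. A standalone sketch of the token-level piece, assuming plain PyTorch and substituting a large negative constant for Constants.REAL_PRAC_MIN:
 import torch
 import torch.nn.functional as F

 def token_level_scores(input_expr, input_mask, query_tok, neg_inf=-1e4):
     # input_expr: [bs, slen, D], input_mask: [bs, slen], query_tok: [L, D]
     att_scores = input_expr @ query_tok.t()  # [bs, slen, L]
     att_scores = att_scores + (1. - input_mask).unsqueeze(-1) * neg_inf  # mask out padding
     attn = F.softmax(att_scores, dim=-2)  # attention over the token dimension
     score_tok = (att_scores * attn).sum(-2)  # [bs, L]
     return score_tok, attn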
Example #6
 def fb_on_batch(self,
                 annotated_insts,
                 training=True,
                 loss_factor=1,
                 **kwargs):
     self.refresh_batch(training)
     margin = self.margin.value
     # gold heads and labels
     gold_heads_arr, _ = self.predict_padder.pad(
         [z.heads.vals for z in annotated_insts])
     gold_labels_arr, _ = self.predict_padder.pad(
         [self.real2pred_labels(z.labels.idxes) for z in annotated_insts])
     gold_heads_expr = BK.input_idx(gold_heads_arr)  # [BS, Len]
     gold_labels_expr = BK.input_idx(gold_labels_arr)  # [BS, Len]
     # ===== calculate
     scoring_expr_pack, mask_expr, jpos_pack = self._prepare_score(
         annotated_insts, training)
     full_arc_score = self._score_arc_full(scoring_expr_pack, mask_expr,
                                           training, margin,
                                           gold_heads_expr)
     #
     final_losses = None
     if self.norm_local or self.norm_single:
         select_label_score = self._score_label_selected(
             scoring_expr_pack, mask_expr, training, margin,
             gold_heads_expr, gold_labels_expr)
         # already added margin previously
         losses_heads = losses_labels = None
         if self.loss_prob:
             if self.norm_local:
                 losses_heads = BK.loss_nll(full_arc_score, gold_heads_expr)
                 losses_labels = BK.loss_nll(select_label_score,
                                             gold_labels_expr)
             elif self.norm_single:
                 single_sample = self.conf.tconf.loss_single_sample
                 losses_heads = self._losses_single(full_arc_score,
                                                    gold_heads_expr,
                                                    single_sample,
                                                    is_hinge=False)
                 losses_labels = self._losses_single(select_label_score,
                                                     gold_labels_expr,
                                                     single_sample,
                                                     is_hinge=False)
             # simply adding
             final_losses = losses_heads + losses_labels
         elif self.loss_hinge:
             if self.norm_local:
                 losses_heads = BK.loss_hinge(full_arc_score,
                                              gold_heads_expr)
                 losses_labels = BK.loss_hinge(select_label_score,
                                               gold_labels_expr)
             elif self.norm_single:
                 single_sample = self.conf.tconf.loss_single_sample
                 losses_heads = self._losses_single(full_arc_score,
                                                    gold_heads_expr,
                                                    single_sample,
                                                    is_hinge=True,
                                                    margin=margin)
                 losses_labels = self._losses_single(select_label_score,
                                                     gold_labels_expr,
                                                     single_sample,
                                                     is_hinge=True,
                                                     margin=margin)
             # simply adding
             final_losses = losses_heads + losses_labels
         elif self.loss_mr:
             # special treatment!
             probs_heads = BK.softmax(full_arc_score, dim=-1)  # [bs, m, h]
             probs_labels = BK.softmax(select_label_score,
                                       dim=-1)  # [bs, m, h]
             # select
             probs_head_gold = BK.gather_one_lastdim(
                 probs_heads, gold_heads_expr).squeeze(-1)  # [bs, m]
             probs_label_gold = BK.gather_one_lastdim(
                 probs_labels, gold_labels_expr).squeeze(-1)  # [bs, m]
             # root and pad will be excluded later
             # Reward = \sum_i 1.*marginal(GEdge_i); while for global models, need to gradient on marginal-functions
             # todo(warn): have problem since steps will be quite small, not used!
             final_losses = mask_expr - probs_head_gold * probs_label_gold  # let loss >= 0
     elif self.norm_global:
         full_label_score = self._score_label_full(scoring_expr_pack,
                                                   mask_expr, training,
                                                   margin, gold_heads_expr,
                                                   gold_labels_expr)
         # for this one, use the merged full score
         full_score = full_arc_score.unsqueeze(-1) + full_label_score  # [BS, m, h, L]
         # +=1 to include ROOT for mst decoding
         mst_lengths_arr = np.asarray([len(z) + 1 for z in annotated_insts],
                                      dtype=np.int32)
         # do inference
         if self.loss_prob:
             marginals_expr = self._marginal(
                 full_score, mask_expr, mst_lengths_arr)  # [BS, m, h, L]
             final_losses = self._losses_global_prob(
                 full_score, gold_heads_expr, gold_labels_expr,
                 marginals_expr, mask_expr)
             if self.alg_proj:
                 # todo(+N): deal with search-error-like problem, discard unproj neg losses (score>weighted-avg),
                 #  but this might be too loose, although the unproj edges are few?
                 gold_unproj_arr, _ = self.predict_padder.pad(
                     [z.unprojs for z in annotated_insts])
                 gold_unproj_expr = BK.input_real(
                     gold_unproj_arr)  # [BS, Len]
                 comparing_expr = Constants.REAL_PRAC_MIN * (
                     1. - gold_unproj_expr)
                 final_losses = BK.max_elem(final_losses, comparing_expr)
         elif self.loss_hinge:
             pred_heads_arr, pred_labels_arr, _ = self._decode(
                 full_score, mask_expr, mst_lengths_arr)
             pred_heads_expr = BK.input_idx(pred_heads_arr)  # [BS, Len]
             pred_labels_expr = BK.input_idx(pred_labels_arr)  # [BS, Len]
             #
             final_losses = self._losses_global_hinge(
                 full_score, gold_heads_expr, gold_labels_expr,
                 pred_heads_expr, pred_labels_expr, mask_expr)
         elif self.loss_mr:
             # todo(+N): Loss = -Reward = \sum marginals, which requires gradients on marginal-one-edge, or marginal-two-edges
             raise NotImplementedError(
                 "Not implemented for global-loss + mr.")
     elif self.norm_hlocal:
         # firstly label losses are the same
         select_label_score = self._score_label_selected(
             scoring_expr_pack, mask_expr, training, margin,
             gold_heads_expr, gold_labels_expr)
         losses_labels = BK.loss_nll(select_label_score, gold_labels_expr)
         # then specially for arc loss
         children_masks_arr, _ = self.hlocal_padder.pad(
             [z.get_children_mask_arr() for z in annotated_insts])
         children_masks_expr = BK.input_real(
             children_masks_arr)  # [bs, h, m]
         # [bs, h]
         # todo(warn): use prod rather than sum, but still only an approximation for the top-down
         # losses_arc = -BK.log(BK.sum(BK.softmax(full_arc_score, -2).transpose(-1, -2) * children_masks_expr, dim=-1) + (1-mask_expr))
         losses_arc = -BK.sum(BK.log_softmax(full_arc_score, -2).transpose(-1, -2) * children_masks_expr, dim=-1)
         # including the root-head is important
         losses_arc[:, 1] += losses_arc[:, 0]
         final_losses = losses_arc + losses_labels
     #
     # jpos loss? (the same mask as parsing)
     jpos_losses_expr = jpos_pack[1]
     if jpos_losses_expr is not None:
         final_losses += jpos_losses_expr
     # collect loss with mask, also excluding the first symbol of ROOT
     final_losses_masked = (final_losses * mask_expr)[:, 1:]
     final_loss_sum = BK.sum(final_losses_masked)
     # divide loss by what?
     num_sent = len(annotated_insts)
     num_valid_tok = sum(len(z) for z in annotated_insts)
     if self.conf.tconf.loss_div_tok:
         final_loss = final_loss_sum / num_valid_tok
     else:
         final_loss = final_loss_sum / num_sent
     #
     final_loss_sum_val = float(BK.get_value(final_loss_sum))
     info = {
         "sent": num_sent,
         "tok": num_valid_tok,
         "loss_sum": final_loss_sum_val
     }
     if training:
         BK.backward(final_loss, loss_factor)
     return info
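Whichever loss branch is taken, the final steps are the same: add the optional joint-POS losses, mask, drop the artificial ROOT position, sum, and divide by either the token or the sentence count. A minimal sketch of that tail, assuming plain PyTorch; the function name is illustrative:
 import torch

 def reduce_parse_loss(final_losses, mask_expr, num_sent, num_valid_tok, div_by_tok=True):
     # final_losses, mask_expr: [bs, len], where position 0 is the artificial ROOT
     masked = (final_losses * mask_expr)[:, 1:]  # exclude ROOT from the loss
     loss_sum = masked.sum()
     return loss_sum / (num_valid_tok if div_by_tok else num_sent)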
Example #7
 def __call__(self, bert_t):
     lambdas_softmax = BK.softmax(self.bert_lambdas, -1).unsqueeze(-1)  # [fold, 1]
     weighted_bert_t = (bert_t * lambdas_softmax).sum(-2) * self.bert_gamma  # [*, D]
     return weighted_bert_t
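This is an ELMo-style scalar mix over the stacked BERT layers: softmax-normalized layer weights followed by a global gamma scale. A standalone sketch assuming plain PyTorch; the module name and the zero/one initialization are illustrative:
 import torch
 import torch.nn as nn

 class ScalarMix(nn.Module):
     def __init__(self, num_layers):
         super().__init__()
         self.lambdas = nn.Parameter(torch.zeros(num_layers))  # per-layer mixing weights
         self.gamma = nn.Parameter(torch.ones(1))  # overall scale

     def forward(self, bert_t):  # [*, num_layers, D]
         weights = torch.softmax(self.lambdas, dim=-1).unsqueeze(-1)  # [num_layers, 1]
         return (bert_t * weights).sum(-2) * self.gamma  # [*, D]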