def get_logits(self, hidden):
    """Get all the logits.

    Parameters
    ----------
    hidden
        The hidden representation
        Shape (..., in_units)

    Returns
    -------
    logits
        Shape (..., |V|)
    """
    if self._cutoffs is None:
        if self._in_units != self._embed_size:
            hidden = self.inter_proj_l[0](hidden)
        logits = self.out_proj_l[0](hidden)
        return logits
    else:
        all_logits = []
        if self._div_val == 1.0:
            if self._in_units == self._embed_size:
                all_scores = self.out_proj_l[0](hidden)
                tail_cluster_scores = self.tail_cluster_score_proj(hidden)
            else:
                inter_hidden = self.inter_proj_l[0](hidden)
                all_scores = self.out_proj_l[0](inter_hidden)
                tail_cluster_scores = self.tail_cluster_score_proj(inter_hidden)
            all_scores_l = np.split(all_scores, self._cutoffs, axis=-1)
            head_scores = all_scores_l[0]
        else:
            inter_hidden = self.inter_proj_l[0](hidden)
            head_scores = self.out_proj_l[0](inter_hidden)
            tail_cluster_scores = self.tail_cluster_score_proj(inter_hidden)
        # Normalize the head-word scores and the per-cluster scores jointly,
        # then split the result back into head logits and cluster logits.
        head_tail_cluster_logits = \
            npx.log_softmax(np.concatenate([head_scores, tail_cluster_scores],
                                           axis=-1), axis=-1)
        head_logits, tail_cluster_logits = \
            np.split(head_tail_cluster_logits, [self._cutoffs[0]], axis=-1)
        tail_cluster_logits = np.split(tail_cluster_logits,
                                       self._num_tail_clusters, axis=-1)
        all_logits.append(head_logits)
        for i in range(1, len(self._cutoffs) + 1):
            if self._div_val == 1.0:
                ele_scores = all_scores_l[i]
            else:
                ele_scores = self.out_proj_l[i](self.inter_proj_l[i](hidden))
            ele_logits = npx.log_softmax(ele_scores, axis=-1)
            # Shift the within-cluster log-probabilities by the log-probability
            # of the i-th cluster. (The original `tail_cluster_logits[-i]`
            # indexed the clusters in reverse order.)
            ele_logits = tail_cluster_logits[i - 1] + ele_logits
            all_logits.append(ele_logits)
        return np.concatenate(all_logits, axis=-1)
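
# Why the tail branch above works: for a word w in tail cluster c, the adaptive
# softmax factorizes log P(w | h) = log P(c | h) + log P(w | c, h), where
# P(c | h) comes from a joint softmax over the head words plus one score per
# cluster. A minimal standalone sketch of that identity with toy sizes, made up
# for illustration (assumes MXNet >= 1.6 with the numpy interface enabled):
from mxnet import np, npx
npx.set_np()

head_scores = np.array([[2.0, 1.0, 0.5]])       # scores for 3 head words
cluster_score = np.array([[0.2]])               # score for 1 tail cluster
tail_scores = np.array([[1.5, -0.3]])           # scores for 2 tail words

# Joint softmax over the head words and the cluster pseudo-token.
head_and_cluster = npx.log_softmax(
    np.concatenate([head_scores, cluster_score], axis=-1), axis=-1)
log_p_cluster = head_and_cluster[:, -1:]        # log P(cluster | h)

# Within-cluster softmax, shifted by the cluster log-probability.
tail_logits = log_p_cluster + npx.log_softmax(tail_scores, axis=-1)

# Head and tail log-probs together form one proper distribution over all 5 words.
all_word_logits = np.concatenate([head_and_cluster[:, :-1], tail_logits], axis=-1)
print(np.exp(all_word_logits).sum(axis=-1))     # ~[1.]
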
def masked_logsoftmax(att_score, mask, axis: int = -1):
    """Ignore the masked elements when calculating the softmax.
    The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    axis
        The axis along which to calculate the softmax.
        att_score.shape[axis] must be the same as mask.shape[axis]

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        The masked positions are filled with -inf
    """
    if mask is None:
        return npx.log_softmax(att_score, axis=axis)
    else:
        mask = mask.astype(np.bool)
        return np.where(mask,
                        npx.masked_log_softmax(att_score, mask, axis=axis),
                        -np.inf)
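
# Quick usage sketch for masked_logsoftmax above: positions with mask == 0 are
# excluded from the normalization and come back as -inf, so exponentiating
# yields a distribution over the unmasked positions only. (Toy values; assumes
# an MXNet build that provides npx.masked_log_softmax, as the function requires.)
att = np.array([[1.0, 2.0, 3.0]])
att_mask = np.array([[1, 1, 0]])
out = masked_logsoftmax(att, att_mask)
# out[..., :2] is the log-softmax of [1.0, 2.0]; out[..., 2] is -inf,
# and np.exp(out)[..., :2] sums to 1.
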
def get_answerable_logits(self, contextual_embedding, p_mask):
    """Get the answerable logits.

    Parameters
    ----------
    contextual_embedding
        Shape (batch_size, sequence_length, C)
    p_mask
        Shape (batch_size, sequence_length)
        Mask of the sequence.
        0 --> the element is masked,
        1 --> the element is not masked

    Returns
    -------
    answerable_logits
        Shape (batch_size, 2)
    """
    # Shape (batch_size, sequence_length)
    start_scores = np.squeeze(self.start_scores(contextual_embedding), -1)
    start_score_weights = masked_softmax(start_scores, p_mask, axis=-1)
    # Aggregate the contextual embeddings with the start-score weights.
    start_agg_feature = npx.batch_dot(np.expand_dims(start_score_weights, axis=1),
                                      contextual_embedding)
    start_agg_feature = np.squeeze(start_agg_feature, 1)
    cls_feature = contextual_embedding[:, 0, :]
    answerable_scores = self.answerable_scores(
        np.concatenate([start_agg_feature, cls_feature], axis=-1))
    answerable_logits = npx.log_softmax(answerable_scores, axis=-1)
    return answerable_logits
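
# The pooling step above in isolation: the masked-softmax start scores act as
# attention weights over the sequence, and batch_dot reduces the contextual
# embeddings to a single feature vector per example. A hedged sketch with
# random tensors (the shapes are made up for illustration):
batch_size, seq_len, C = 2, 4, 8
emb = np.random.normal(size=(batch_size, seq_len, C))
weights = npx.softmax(np.random.normal(size=(batch_size, seq_len)), axis=-1)
agg = npx.batch_dot(np.expand_dims(weights, axis=1), emb)  # (batch, 1, C)
agg = np.squeeze(agg, 1)                                   # (batch, C)
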
def forward(self, logits, labels, length_ratio, source_length, target_length):
    """
    :param logits: Model logits. Shape: (batch, length, vocab_size).
    :param labels: Gold targets. Shape: (batch, length).
    :param length_ratio: Length ratios. Shape: (batch,).
    :param source_length: Source lengths. Shape: (batch,).
    :param target_length: Target lengths. Shape: (batch,).
    :return: Sequence scores. Shape: (batch,).
    """
    logprobs = npx.log_softmax(logits, axis=-1,
                               temperature=self.softmax_temperature)

    # Select the log-probability of each gold label.
    # token_scores: (batch_size, target_seq_len)
    token_scores = npx.pick(logprobs, labels, axis=-1)
    if self.score_type == C.SCORING_TYPE_NEGLOGPROB:
        token_scores = token_scores * -1

    # Sum, then apply length penalty. The call to `np.where` masks out
    # invalid (padding) positions from the scores.
    # scores: (batch_size,)
    scores = np.sum(np.where(labels != 0,
                             token_scores,
                             np.zeros_like(token_scores)), axis=1)

    if self.constant_length_ratio is not None and self.constant_length_ratio > 0.0:
        predicted_output_length = source_length * self.constant_length_ratio
    else:
        predicted_output_length = source_length * length_ratio

    scores = self.scorer(scores, target_length, predicted_output_length)

    return scores
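
# The scoring recipe above with the model stripped away: per-token log-probs
# via log_softmax + pick, then padding positions (label 0) zeroed before the
# sum. A minimal sketch with toy shapes (plain MXNet ops, nothing
# Sockeye-specific; no length penalty applied here):
toy_logits = np.random.normal(size=(2, 5, 10))   # (batch, length, vocab)
toy_labels = np.array([[4, 2, 7, 0, 0],
                       [1, 3, 0, 0, 0]])         # 0 = padding
toy_logprobs = npx.log_softmax(toy_logits, axis=-1)
toy_token_scores = npx.pick(toy_logprobs, toy_labels, axis=-1)      # (batch, length)
toy_seq_scores = np.sum(np.where(toy_labels != 0, toy_token_scores,
                                 np.zeros_like(toy_token_scores)), axis=1)  # (batch,)
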
def forward(self, logits: np.ndarray, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    pred = npx.log_softmax(logits, axis=-1)

    # (batch, len)
    neg_log_likelihood = - npx.pick(pred,  # pylint: disable=invalid-unary-operand-type
                                    labels, axis=-1, keepdims=False)

    # label smoothing as in
    # https://github.com/dmlc/gluon-nlp/blob/b714eaccc67619d7bdcbd1574d30be87d9c73f0c/src/gluonnlp/loss.py#L4
    if self._alpha > 0:
        all_scores = np.sum(pred, axis=-1)
        neg_log_likelihood = (1 - self._alpha) * neg_log_likelihood \
                             - self._alpha / self._num_labels * all_scores

    # (batch, len)
    valid_mask = labels != self.ignore_label
    # (batch, len)
    loss = neg_log_likelihood * valid_mask
    # (1,)
    num_valid = np.sum(valid_mask)
    # (1,)
    ce = np.sum(loss) * self.weight

    # We need to divide by num_valid here to backpropagate a 'valid'
    # normalized loss value like in SoftmaxOutput.
    return ce / num_valid, np.ones((1,))
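
# Why the smoothing line above is a cross-entropy: (1 - alpha) * NLL(y)
# - alpha/V * sum_v log p(v) equals -sum_v q(v) log p(v) for the smoothed
# target q = (1 - alpha) * onehot(y) + alpha/V. A numeric check with toy
# values (uniform smoothing over all V classes, matching the formula above):
alpha, V = 0.1, 4
pred_check = npx.log_softmax(np.array([[1.0, 2.0, 0.5, -1.0]]), axis=-1)
label_check = np.array([1])
lhs = (1 - alpha) * -npx.pick(pred_check, label_check, axis=-1) \
      - alpha / V * np.sum(pred_check, axis=-1)
q = np.array([[alpha / V, 1 - alpha + alpha / V, alpha / V, alpha / V]])
rhs = -np.sum(q * pred_check, axis=-1)
print(lhs, rhs)   # equal up to float error
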
def masked_logsoftmax(att_score, mask, dtype=np.float32, axis: int = -1):
    """Ignore the masked elements when calculating the softmax.
    The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    dtype
        data type
    axis
        The axis along which to calculate the softmax.
        att_score.shape[axis] must be the same as mask.shape[axis]

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        The masked positions are filled with -inf
    """
    if mask is not None:
        # Fill in the masked scores with a very small value
        neg = -1e18
        if _np.dtype(dtype) == np.float16:
            neg = -1e4
        else:
            try:
                # If AMP (automatic mixed precision) is enabled, -1e18 will cause NaN.
                from mxnet.contrib import amp
                if amp.amp._amp_initialized:
                    neg = -1e4
            except ImportError:
                pass
        att_score = np.where(mask, att_score, neg)
        logits = np.where(mask, npx.log_softmax(att_score, axis=axis), -np.inf)
    else:
        logits = npx.log_softmax(att_score, axis=axis)
    return logits
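
# The float16 special case above exists because -1e18 overflows fp16 (whose
# maximum magnitude is about 65504), turning the fill value into -inf and
# letting log_softmax produce NaN, while -1e4 is representable and still
# drives the softmax weight of masked positions to effectively zero.
# A quick demonstration with plain NumPy:
import numpy as _np
print(_np.float16(-1e18))   # -inf (overflow)
print(_np.float16(-1e4))    # -10000.0 (representable)
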
def decode_step(self,
                step_input: np.ndarray,
                states: List,
                vocab_slice_ids: Optional[np.ndarray] = None):
    logits, states, target_factor_outputs = self._model.decode_step(step_input, states,
                                                                    vocab_slice_ids)
    if not self._skip_softmax:
        logits = npx.log_softmax(logits, axis=-1, temperature=self._softmax_temperature)
    scores = -logits

    target_factors = None  # type: Optional[np.ndarray]
    if target_factor_outputs:
        # Target factors are greedily 'decoded'.
        factor_predictions = [npx.cast(np.expand_dims(np.argmax(tfo, axis=1), axis=1),
                                       dtype='int32')
                              for tfo in target_factor_outputs]
        target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
            else np.concatenate(factor_predictions, axis=1)
    return scores, states, target_factors
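
# The factor handling above, in isolation: each target-factor output is
# 'decoded' greedily with argmax, cast to int32, and the per-factor
# predictions are stacked along axis 1. A toy sketch (made-up shapes:
# 3 factors, batch of 2, 6 classes per factor):
factor_outputs = [np.random.normal(size=(2, 6)) for _ in range(3)]
factor_preds = [npx.cast(np.expand_dims(np.argmax(tfo, axis=1), axis=1), dtype='int32')
                for tfo in factor_outputs]
stacked_factors = np.concatenate(factor_preds, axis=1)   # (batch, num_factors)
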
def forward(self, pred, label):
    """
    Parameters
    ----------
    pred :
        The predictions of the network. Shape (..., V)
    label :
        The labels. Shape (..., )

    Returns
    -------
    loss :
        Shape (..., )
    """
    if not self._from_logits:
        pred = npx.log_softmax(pred, axis=-1)
    log_likelihood = npx.pick(pred, label, axis=-1)
    all_scores = pred.sum(axis=-1)
    loss = - (1 - self._alpha) * log_likelihood \
           - self._alpha / float(self._num_labels) * all_scores
    return loss
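
# Usage sketch for the loss above: it accepts any leading shape, e.g.
# predictions of shape (batch, length, V) against labels of shape
# (batch, length) yield a (batch, length) loss. With alpha = 0 and
# from_logits=False it reduces to plain token-level negative log-likelihood
# (hypothetical toy values):
ls_pred = np.random.normal(size=(2, 3, 5))
ls_label = np.array([[0, 2, 4], [1, 1, 3]])
ls_nll = -npx.pick(npx.log_softmax(ls_pred, axis=-1), ls_label, axis=-1)
# ls_nll has shape (2, 3), matching the documented Shape (..., )
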
def log_linear_interpolation(predictions):
    log_probs = utils.average_arrays([np.log(p) for p in predictions])
    return -npx.log_softmax(log_probs)  # pylint: disable=invalid-unary-operand-type
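
# What log_linear_interpolation computes: a geometric-mean ensemble. Averaging
# the logs and re-normalizing with log_softmax yields p_ens(v) proportional to
# prod_m p_m(v)^(1/M); the leading minus converts log-probabilities into costs.
# A toy check with two "model" distributions (utils.average_arrays is assumed
# to be an elementwise mean, as its use above suggests):
p1 = np.array([[0.7, 0.2, 0.1]])
p2 = np.array([[0.5, 0.3, 0.2]])
avg_log = (np.log(p1) + np.log(p2)) / 2
ens = np.exp(npx.log_softmax(avg_log))     # proportional to sqrt(p1 * p2)
print(ens / np.sqrt(p1 * p2))              # constant ratio across the vocabulary
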