def forward(self, scores, target_dists, finished, best_hyp_indices):
    """
    Sample one extension per hypothesis from its (optionally top-n truncated) distribution.

    :param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size)
    :param target_dists: The non-cumulative target distributions as negative log-probabilities.
           They are exponentiated here and used as the sampling distribution.
    :param finished: The list of finished hypotheses.
    :param best_hyp_indices: Best hypothesis indices constant.
    :return: The row indices, column indices, and values of the sampled words.
    """
    # exp(-x) turns the negative logprobs into probabilities, i.e. a distribution per row.
    probs = np.exp(-target_dists)

    # self.n == 0 samples from the full vocabulary; any other value restricts sampling to the top n.
    if self.n != 0:
        # Mask selecting the n most probable entries of each row.
        top_n_mask = npx.topk(probs, k=self.n, ret_typ='mask', axis=1, is_ascend=False)
        # Keep the probabilities inside the mask; everything outside takes the mask's own value (0).
        truncated = np.where(top_n_mask, probs, top_n_mask)
        # Renormalize so each row sums to one again.
        probs = truncated / np.sum(truncated, axis=1, keepdims=True)

    # Draw one word per row, then look up its value in the cumulative scores.
    sampled_words = npx.random.categorical(probs, get_prob=False)
    # Finished hypotheses always get word index zero.
    sampled_words = np.where(finished, np.zeros_like(sampled_words), sampled_words)
    sampled_values = npx.pick(scores, sampled_words, axis=1, keepdims=True)

    best_hyp_indices = npx.slice_like(best_hyp_indices, sampled_words, axes=(0,))
    return best_hyp_indices, sampled_words, sampled_values
def forward(self, target_dists, finished, inactive, scores_accumulated, lengths, max_lengths,
            unk_dist, pad_dist, eos_dist):
    """
    Combine step distributions with accumulated scores and update hypothesis lengths.

    :param target_dists: Per-step scores. Shape: (batch*beam, vocab_size).
    :param finished: Non-zero for rows that are already finished. Broadcast against the scores.
    :param inactive: Non-zero for inactive rows. Broadcast against the scores.
    :param scores_accumulated: Score accumulated so far per row. Shape: (batch*beam, 1).
    :param lengths: Current hypothesis lengths.
    :param max_lengths: Maximum allowed lengths, compared element-wise with `lengths`.
    :param unk_dist: Optional term added to `target_dists` to avoid generating <unk>; may be None.
    :param pad_dist: Columns appended after `scores_accumulated` to build the row used for
           finished/inactive items (presumably vocab_size - 1 columns of inf -- confirm with caller).
    :param eos_dist: Term added to the scores of rows that are forced to produce <eos>.
    :return: Tuple (scores, lengths) after the update.
    """
    # Make sure <unk> is not generated when an unk distribution is supplied.
    if unk_dist is not None:
        target_dists = target_dists + unk_dist

    # Broadcast each hypothesis' accumulated score over all of its predictions.
    step_scores = target_dists + scores_accumulated

    # Finished or inactive rows are replaced wholesale: column zero carries the previous
    # accumulated score and the remaining columns come from pad_dist.
    frozen_row_scores = np.concatenate((scores_accumulated, pad_dist), axis=1)
    is_frozen = np.logical_or(finished, inactive)
    step_scores = np.where(is_frozen, frozen_row_scores, step_scores)

    # Every row that is not finished grows by one. Inactive rows grow too, which is harmless
    # because they are ignored anyway.
    lengths = lengths + (1 - finished)

    # Rows that reached their maximum length without being finished are forced to <eos>;
    # rows below the maximum length, or already finished, keep their scores.
    keep_scores = np.logical_or(lengths < max_lengths, finished)
    step_scores = np.where(keep_scores, step_scores, eos_dist + step_scores)

    return step_scores, lengths
def plot_loss_wrong_preds(history, x=None, y=None, yhat=None, labels=None):
    """Plot the loss curves and, when `labels` is given, a grid of sample predictions.

    Parameters
    ----------
    history
        Indexable with two entries: `history[0]` is plotted as the training
        error and `history[1]` as the validation error.
    x
        Either a numpy array of images (indexed by sample) or an indexable of
        image paths that `plt.imread` can load. Only used when `labels` is given.
    y
        Array of true class indices. Only used when `labels` is given.
    yhat
        Array of predicted class indices. Only used when `labels` is given.
    labels
        Maps a class index to its display name. When None or empty, only the
        loss plot is shown.
    """
    # Plot the train loss and val loss
    plt.plot(history[0], label='train_err')
    plt.plot(history[1], label='val_err')
    plt.title('Error trend')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    # Explicit emptiness check: truth-testing a multi-element ndarray raises ValueError.
    if labels is None or len(labels) == 0:
        return

    mis_idx = np.where(y != yhat)[0]
    hit_idx = np.where(y == yhat)[0]
    # replace=False so that the same sample is never displayed twice.
    wrong_preds = np.random.choice(mis_idx, size=min(len(mis_idx), 8), replace=False)
    correct_preds = np.random.choice(hit_idx, size=min(len(hit_idx), 8), replace=False)

    fig = plt.figure(figsize=(12, 12))
    columns = 4
    rows = 4

    def _show_samples(sample_indices, first_slot, color):
        """Draw each sample in consecutive subplot slots starting at `first_slot`."""
        for offset, j in enumerate(sample_indices):
            j = j.item()
            # Arrays hold the pixel data directly; anything else is treated as a path.
            img = x[j] if isinstance(x, np.ndarray) else plt.imread(x[j])
            axis = fig.add_subplot(rows, columns, first_slot + offset)
            axis.set_title(
                f'true: {labels[y[j].item()]}, pred: {labels[yhat[j].item()]}',
                color=color)
            plt.imshow(img)

    # Slots 1-8 (top two rows) hold correct predictions, slots 9-16 the wrong ones.
    _show_samples(correct_preds, 1, 'g')
    _show_samples(wrong_preds, 9, 'r')

    plt.tight_layout(pad=1.2)
    fig.suptitle('Sample Correct(8) & Wrong predictions(8)', y=0.001)
    plt.show()
def masked_logsoftmax(att_score, mask, axis: int = -1):
    """Compute a log-softmax that ignores masked-out elements.

    The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    axis
        The axis to calculate the softmax. att_score.shape[axis] must be the same as mask.shape[axis]

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        Positions where the mask is 0 are set to -inf.
    """
    # Without a mask this is an ordinary log-softmax.
    if mask is None:
        return npx.log_softmax(att_score, axis=axis)

    bool_mask = mask.astype(np.bool)
    masked_logits = npx.masked_log_softmax(att_score, bool_mask, axis=axis)
    # Explicitly overwrite the masked positions with -inf.
    return np.where(bool_mask, masked_logits, -np.inf)
def forward(self, logits, labels, length_ratio, source_length, target_length):
    """
    Score each target sequence under the model logits.

    :param logits: Model logits. Shape: (batch, length, vocab_size).
    :param labels: Gold targets. Shape: (batch, length).
    :param length_ratio: Length Ratios. Shape: (batch,).
    :param source_length: Source lengths. Shape: (batch,).
    :param target_length: Target lengths. Shape: (batch,).
    :return: Sequence scores. Shape: (batch,).
    """
    # Log-probabilities over the vocabulary, then the entry of each gold label.
    # token_scores: (batch_size, target_seq_len)
    log_probs = npx.log_softmax(logits, axis=-1, temperature=self.softmax_temperature)
    token_scores = npx.pick(log_probs, labels, axis=-1)
    if self.score_type == C.SCORING_TYPE_NEGLOGPROB:
        token_scores = token_scores * -1

    # Positions whose label is 0 contribute nothing to the sum.
    # sequence_scores: (batch_size,)
    valid_token_scores = np.where(labels != 0, token_scores, np.zeros_like(token_scores))
    sequence_scores = np.sum(valid_token_scores, axis=1)

    # A positive constant length ratio takes precedence over the per-sentence ratio.
    constant_ratio = self.constant_length_ratio
    use_constant_ratio = constant_ratio is not None and constant_ratio > 0.0
    ratio = constant_ratio if use_constant_ratio else length_ratio
    predicted_output_length = source_length * ratio

    # Apply the length penalty via the configured scorer.
    return self.scorer(sequence_scores, target_length, predicted_output_length)
def forward(self, best_hyp_indices, best_word_indices, finished, scores_accumulated, lengths,
            reference_lengths, factors=None):
    """
    Reorder beam state by the chosen hypotheses and normalize those that just finished.

    :param best_hyp_indices: Indices of the selected hypotheses, used to reorder the beam data.
    :param best_word_indices: Word chosen for each selected hypothesis.
    :param finished: Finished flags before this step.
    :param scores_accumulated: Accumulated scores of the hypotheses.
    :param lengths: Hypothesis lengths.
    :param reference_lengths: Reference lengths passed on to the scorer.
    :param factors: Optional secondary target factors; reordered and appended as extra columns.
    :return: Tuple (best_word_indices, finished, scores_accumulated, lengths, reference_lengths).
    """
    # Bring the fixed-size beam data into the order given by best_hyp_indices.
    finished = np.take(finished, best_hyp_indices, axis=0)
    lengths = np.take(lengths, best_hyp_indices, axis=0)
    reference_lengths = np.take(reference_lengths, best_hyp_indices, axis=0)

    # A hypothesis counts as finished once it is extended with <pad> or <eos>.
    ends_now = np.logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id)
    ends_now = np.expand_dims(ends_now, axis=1)

    # Only hypotheses that JUST finished (flag differs from the previous state) get normalized.
    newly_finished = np.logical_xor(ends_now, finished)
    normalized_scores = self._scorer(scores_accumulated, npx.cast(lengths, self.dtype), reference_lengths)
    scores_accumulated = np.where(newly_finished, normalized_scores, scores_accumulated)

    # The recomputed finished flags are returned as int32.
    finished = npx.cast(ends_now, 'int32')

    # Append the reordered secondary factors as extra columns. Shape: (batch*beam, num_factors)
    best_word_indices = np.expand_dims(best_word_indices, axis=1)
    if factors is not None:
        reordered_factors = np.take(factors, best_hyp_indices, axis=0)
        best_word_indices = np.concatenate((best_word_indices, reordered_factors), axis=1)

    return best_word_indices, finished, scores_accumulated, lengths, reference_lengths
def print_model_metrics(y, yhat, labels, title, stream=sys.stdout, wrong_preds=False):
    """Print a classification report, confusion matrix and optional mispredictions.

    Parameters
    ----------
    y
        True labels, either 1-D class indices or a one-hot / score matrix.
    yhat
        Predicted labels in the same layout as `y`.
    labels
        Class names passed to the classification report as `target_names`.
    title
        Heading printed before the metrics.
    stream
        File-like object that receives all output.
    wrong_preds
        When true, also print up to 10 distinct randomly chosen mispredictions.
    """
    # Anything that is not 1-D is treated as one-hot encoded and reduced to class indices.
    if len(y.shape) != 1:
        y = y.argmax(axis=1)
        yhat = yhat.argmax(axis=1)

    print('\n' + title + '\n------------\n', file=stream)

    print("Classification Metrics\n----------\n", file=stream)
    print(met.classification_report(y, yhat, target_names=labels, zero_division=1), file=stream)

    print("Confusion Matrix\n----------\n", file=stream)
    print(met.confusion_matrix(y, yhat), file=stream)

    if wrong_preds:
        print("Wrong Predictions\n----------\n", file=stream)
        mis_idx = np.where(y != yhat)[0]
        sample_size = min(len(mis_idx), 10)
        # replace=False so that the same misprediction is not reported twice.
        sampled_idx = np.random.choice(mis_idx, size=sample_size, replace=False)
        for idx in sampled_idx:
            print('Original Label : {0}'.format(y[idx]), file=stream)
            print('Predicted Label : {0}'.format(yhat[idx]), file=stream)
            print('********************', file=stream)
def relative_position_bucket(relative_position, bidirectional: bool = True,
                             num_buckets: int = 32, max_distance: int = 128):
    """Map relative positions to bucket ids.

    The implementation is consistent with that in
    [mesh_tensorflow](https://github.com/tensorflow/mesh/blob/c59988047e49b4d2af05603e3170724cdbadc467/mesh_tensorflow/transformer/transformer_layers.py#L595-L637)
    where relative position is defined as `mem_i - query_j`. Thus, a positive value indicates
    that the memory slot is in a later timestamp than the query slot.

    The input is negated first. After handling the bidirectional case (see below), the first
    half of the available buckets stores exact distances and the second half stores distances
    after a logarithmic transformation.

    Parameters
    ----------
    relative_position
        Shape (...,)
    bidirectional
        Whether we are dealing with bidirectional attention.
        If True, inputs with `relative_position > 0` are mapped to
        [num_buckets // 2, num_buckets) and all other inputs to [0, num_buckets // 2).
        If False, inputs with `relative_position > 0` are clipped to distance 0.
    num_buckets
        The number of buckets. Must be even when `bidirectional` is True.
    max_distance
        Maximum distance. Positions that fall outside of 'max_distance' will be trimmed
        into the last bucket.

    Returns
    -------
    buckets
        Shape (...,). It has the same shape as the `relative_position` and holds integer
        bucket ids.
    """
    distance = -relative_position
    side_offset = 0
    if bidirectional:
        assert num_buckets % 2 == 0, 'When bidirectional is True, the number of buckets must be ' \
                                     'divisible by 2.'
        # Each direction gets half of the buckets; negative distances use the upper half.
        num_buckets //= 2
        side_offset = side_offset + (distance < 0).astype(np.int32) * num_buckets
        distance = np.abs(distance)
    else:
        # Clip all the negative values to 0
        distance = np.clip(distance, a_min=0, a_max=None)

    # From here on the distance is in [0, inf).
    # Buckets 0 .. max_exact - 1 each represent exactly one distance.
    max_exact = num_buckets // 2

    # The remaining buckets cover logarithmically growing ranges up to max_distance.
    log_ratio = np.log(distance.astype(np.float32) / max_exact)
    log_position = log_ratio / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    log_bucket = max_exact + log_position.astype(np.int32)
    # Everything beyond max_distance collapses into the last bucket.
    log_bucket = np.minimum(log_bucket, num_buckets - 1)

    return side_offset + np.where(distance < max_exact, distance, log_bucket)
def masked_logsoftmax(att_score, mask, dtype=np.float32, axis: int = -1):
    """Compute a log-softmax that ignores masked-out elements.

    The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    dtype
        Data type; only used to pick the fill value for masked scores.
    axis
        The axis to calculate the softmax. att_score.shape[axis] must be the same as mask.shape[axis]

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        Positions where the mask is 0 are set to -inf.
    """
    # Without a mask this is an ordinary log-softmax.
    if mask is None:
        return npx.log_softmax(att_score, axis=axis)

    # Masked scores are filled with a very small value before the softmax.
    fill_value = -1e18
    if _np.dtype(dtype) == np.float16:
        fill_value = -1e4
    else:
        try:
            # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN.
            from mxnet.contrib import amp
            if amp.amp._amp_initialized:
                fill_value = -1e4
        except ImportError:
            pass

    filled_score = np.where(mask, att_score, fill_value)
    # Masked positions are explicitly overwritten with -inf in the result.
    return np.where(mask, npx.log_softmax(filled_score, axis=axis), -np.inf)
def test_samplek_func(batch_size, beam_size, target_vocab_size, top_n):
    """Check SampleK output shapes, top-n truncation and handling of finished rows."""
    pytest.importorskip("mxnet")
    from mxnet import np
    import sockeye.beam_search

    # Every row holds 1..target_vocab_size, so values increase from left to right and
    # index 0 is always the best item, index 1 the next-best, and so on.
    scores = np.array([
        list(range(1, target_vocab_size + 1)) for _ in range(batch_size * beam_size)
    ])
    # normalize
    # NOTE: target_dists is computed but never passed on; the sampler receives `scores` twice.
    target_dists = scores / scores.sum(axis=1, keepdims=True)

    samplek = sockeye.beam_search.SampleK(n=top_n)
    samplek.initialize()

    sample_best_hyp_indices = np.arange(0, batch_size * beam_size, dtype='int32')

    # 0..(batch_size * beam_size)-1
    expected_hyps = np.array(range(batch_size * beam_size), dtype='int32')
    finished = (np.random.uniform(0, 1, (batch_size * beam_size)) > 0.5).astype('int32')

    # First pass runs the block as-is, second pass runs it hybridized.
    for hybridized in (False, True):
        if hybridized:
            samplek.hybridize()

        hyps, words, values = samplek(scores, scores, finished, sample_best_hyp_indices)
        assert hyps.shape[0] == batch_size * beam_size

        # The indices should always be the integers from 0 to batch*beam-1
        assert sum(hyps == expected_hyps).item() == (batch_size * beam_size)
        if top_n != 0:
            # Scores are increasing left-to-right, so best items are all the lowest word IDs.
            # No word id greater than the cap (top_n) should be selected
            assert np.sum(words >= top_n).item() == 0

        # word index should be zero for all finished hypotheses
        assert np.sum(np.where(finished, words, finished)).item() == 0
def forward(self, inp):  # pylint: disable=arguments-differ
    """Look up (and, where needed, project) the embeddings of the given token ids.

    Parameters
    ----------
    inp
        Shape (...,)

    Returns
    -------
    out
        Shape (..., units)
    """
    if self._div_val == 1.0:
        # Single embedding table; project only when its size differs from the output units.
        out = np.take(self.embed0_weight.data(), inp, axis=0)
        if self._units != self._embed_size:
            out = np.dot(out, self.inter_proj0_weight.data())
    else:
        # One table + projection per vocabulary segment [lower, upper).
        lower_bounds = [0] + self._cutoffs
        upper_bounds = self._cutoffs + [self._vocab_size]
        out = None
        for seg, (lower, upper) in enumerate(zip(lower_bounds, upper_bounds)):
            table = getattr(self, 'embed{}_weight'.format(seg)).data()
            projection = getattr(self, 'inter_proj{}_weight'.format(seg)).data()
            # Ids outside this segment are clipped into range; their rows are discarded below.
            seg_emb = np.dot(np.take(table, inp - lower, axis=0, mode='clip'), projection)
            if out is None:
                out = seg_emb
            else:
                in_segment = np.expand_dims((inp >= lower) * (inp < upper), axis=-1)
                out = np.where(in_segment, seg_emb, out)
    if self._scaled:
        out = out * self._emb_scale
    return out
def dynamic_masking(self, input_ids, valid_lengths):
    # TODO(zheyuye), two additional flag `disallow_from_mask` and `already_masked`
    # that control the masking status for each positions in the sequence.
    """
    Generate masking positions on-the-fly instead of during preprocessing.

    Candidate positions exclude the [CLS], [SEP] and [PAD] ids of `self.vocab`. Up to
    `self._max_num_masked_position` positions are drawn per sequence, and a random subset
    of them (controlled by `self._replace_prob`) is overwritten with the [MASK] id.

    Parameters
    ----------
    input_ids
        The batchified input_ids with shape (batch_size, max_seq_length)
    valid_lengths
        The batchified valid_lengths with shape (batch_size, )

    Returns
    -------
    masked_input
        A `self.MaskedInput` with the following fields.

        input_ids
            The input sequence after replacing selected tokens with [MASK].
            Presumably shape (batch_size, max_seq_length) -- depends on
            `update_vectors_by_position`, confirm.
        masks
            Length mask built from `valid_lengths` over a ones array shaped like `input_ids`.
        unmasked_tokens
            The original tokens at the selected positions, multiplied by the position weights.
            Shape (batch_size, num_masked_positions)
        masked_positions
            The selected positions, multiplied by the position weights.
            Shape (batch_size, num_masked_positions)
        masked_weights
            The float32 weights produced by the sequence mask over the selected positions.
            Shape (batch_size, num_masked_positions)
    """
    N = self._max_num_masked_position
    # Only valid token without special token are allowed to mask
    valid_candidates = np.ones_like(input_ids, dtype=np.bool)
    ignore_tokens = [
        self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id
    ]

    for ignore_token in ignore_tokens:
        # TODO(zheyuye), Update when operation += supported
        valid_candidates = valid_candidates * \
            np.not_equal(input_ids, ignore_token)
    valid_lengths = valid_lengths.astype(np.float32)
    valid_candidates = valid_candidates.astype(np.float32)
    # Number of positions to mask per sequence: valid_length * mask_prob, rounded and
    # limited to the range [1, N].
    # NOTE(review): the built-in round() is applied to an array here, which relies on the
    # array type supporting it -- confirm.
    num_masked_position = mxnp.maximum(
        1, np.minimum(N, round(valid_lengths * self._mask_prob)))

    # Get the masking probability of each position
    sample_probs = self._proposal_distribution * valid_candidates
    sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
    sample_probs = npx.stop_gradient(sample_probs)
    gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))
    # Following the instruction of the official repo to avoid duplicate positions
    # with Top_k Sampling as https://github.com/google-research/electra/issues/41
    # Gumbel noise is added to the log-probabilities and the top N indices are taken.
    masked_positions = npx.topk(mxnp.log(sample_probs) + gumbels,
                                k=N,
                                axis=-1,
                                ret_typ='indices',
                                dtype=np.int32)

    # Sequence mask over the N selected slots using num_masked_position as the length,
    # with 0 as the fill value.
    masked_weights = npx.sequence_mask(mxnp.ones_like(masked_positions),
                                       sequence_length=num_masked_position,
                                       use_sequence_length=True,
                                       axis=1,
                                       value=0)
    # Slots with weight 0 end up pointing at position 0.
    masked_positions = masked_positions * masked_weights
    length_masks = npx.sequence_mask(mxnp.ones_like(input_ids,
                                                    dtype=np.float32),
                                     sequence_length=valid_lengths,
                                     use_sequence_length=True,
                                     axis=1,
                                     value=0)
    unmasked_tokens = select_vectors_by_position(
        input_ids, masked_positions) * masked_weights
    masked_weights = masked_weights.astype(np.float32)
    # Keep a selected position for replacement when a uniform [0, 1) draw falls below
    # self._replace_prob; all other slots become 0.
    replaced_positions = (mxnp.random.uniform(
        mxnp.zeros_like(masked_positions), mxnp.ones_like(
            masked_positions)) < self._replace_prob) * masked_positions
    # dealing with multiple zero values in replaced_positions which causes
    # the [CLS] being replaced: slots whose position is 0 write the [CLS] id,
    # all other slots write the [MASK] id
    filled = mxnp.where(replaced_positions, self.vocab.mask_id,
                        self.vocab.cls_id).astype(np.int32)
    # Masking token by replacing with [MASK]
    masked_input_ids = update_vectors_by_position(input_ids, filled,
                                                  replaced_positions)

    # Note: It is likely to have multiple zero values in masked_positions if the number of
    # masked positions has not reached the maximum. However, this case hardly occurs since
    # valid_length is almost always equal to max_seq_length
    masked_input = self.MaskedInput(input_ids=masked_input_ids,
                                    masks=length_masks,
                                    unmasked_tokens=unmasked_tokens,
                                    masked_positions=masked_positions,
                                    masked_weights=masked_weights)
    return masked_input