def forward(self, data, valid_length):
    """Generate the representation given the inputs.

    This is used in training or fine-tuning a BERT model.

    Parameters
    ----------
    data
        - layout = 'NT'
            Shape (batch_size, seq_length, C)
        - layout = 'TN'
            Shape (seq_length, batch_size, C)
    valid_length
        Shape (batch_size,)

    Returns
    -------
    out
        - layout = 'NT'
            Shape (batch_size, seq_length, C_out)
        - layout = 'TN'
            Shape (seq_length, batch_size, C_out)
    """
    # 1. Embed the data
    time_axis = 1 if self.layout == 'NT' else 0
    attn_mask = gen_self_attn_mask(data, valid_length, dtype=self._dtype,
                                   attn_type='full', layout=self.layout)
    out = data
    all_encodings_outputs = []
    additional_outputs = []
    for layer_idx in range(self._num_layers):
        groups_id = layer_idx // self._num_layers_each_group
        layer = self.all_encoder_groups[groups_id]
        out, attention_weights = layer(out, attn_mask)
        # out : [batch_size, seq_len, units]
        # attention_weights : [batch_size, num_heads, seq_len, seq_len]
        if self._output_all_encodings:
            out = npx.sequence_mask(out, sequence_length=valid_length,
                                    use_sequence_length=True, axis=time_axis)
            all_encodings_outputs.append(out)
        if self._output_attention:
            additional_outputs.append(attention_weights)
    if not self._output_all_encodings:
        # if self._output_all_encodings, SequenceMask is already applied above
        out = npx.sequence_mask(out, sequence_length=valid_length,
                                use_sequence_length=True, axis=time_axis)
        return out, additional_outputs
    else:
        return all_encodings_outputs, additional_outputs
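# A hedged sketch of the mask consumed above, reusing `gen_self_attn_mask`
# from this module's own imports (its exact output layout is an assumption):
# with attn_type='full' and layout 'NT', batch element b may attend only
# among its first valid_length[b] positions.
from mxnet import np, npx
npx.set_np()

data = np.zeros((2, 4, 8))                        # (batch_size, seq_length, C)
mask = gen_self_attn_mask(data, np.array([2, 4]), dtype=np.float32,
                          attn_type='full', layout='NT')
# mask[0] keeps positions 0-1 only; mask[1] keeps all 4 positions.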
def test_sequence_mask():
    A = np.ones((2, 2, INT_OVERFLOW))
    A.attach_grad()
    with mx.autograd.record():
        B = npx.sequence_mask(A, sequence_length=np.array([1, 1]),
                              use_sequence_length=True)
    assert B.shape == (2, 2, INT_OVERFLOW)
    assert B[0][0][0] == 1
    assert B[1][0][0] == 0
    B.backward()
    assert A.grad.shape == (2, 2, INT_OVERFLOW)
    assert A.grad[0][0][0] == 1
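# A small usage sketch of the semantics this test exercises: along `axis`
# (default 0, the sequence axis), entries at or beyond each batch element's
# sequence_length are overwritten with `value` (default 0), and gradients
# flow only through the kept entries.
from mxnet import np, npx
npx.set_np()

A = np.ones((3, 2))                               # (seq_len, batch)
B = npx.sequence_mask(A, sequence_length=np.array([1, 2]),
                      use_sequence_length=True)
# B == [[1, 1], [0, 1], [0, 0]]: column j keeps its first
# sequence_length[j] rows.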
def masked_softmax(X, valid_len):
    # Masked softmax is needed because padded positions must receive zero
    # attention weight; valid_len gives the number of real (non-pad) entries
    # in each sequence.
    # X: 3-D tensor, valid_len: 1-D or 2-D tensor
    if valid_len is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_len.ndim == 1:
            valid_len = valid_len.repeat(shape[1], axis=0)
        else:
            valid_len = valid_len.reshape(-1)
        # Fill masked elements with a large negative value, whose exp is 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_len, True,
                              axis=1, value=-1e6)
        return npx.softmax(X).reshape(shape)
def masked_softmax(X, valid_lens):
    """Perform softmax operation by masking elements on the last axis."""
    # `X`: 3D tensor, `valid_lens`: 1D or 2D tensor
    if valid_lens is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_lens.ndim == 1:
            valid_lens = valid_lens.repeat(shape[1])
        else:
            valid_lens = valid_lens.reshape(-1)
        # On the last axis, replace masked elements with a very large negative
        # value, whose exponentiation outputs 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_lens, True,
                              value=-1e6, axis=1)
        return npx.softmax(X).reshape(shape)
def masked_softmax(X, valid_len):
    """Perform softmax by filtering out some elements."""
    # X: 3-D tensor, valid_len: 1-D or 2-D tensor
    if valid_len is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_len.ndim == 1:
            valid_len = valid_len.repeat(shape[1], axis=0)
        else:
            valid_len = valid_len.reshape(-1)
        # Fill masked elements with a large negative value, whose exp is 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_len, True,
                              axis=1, value=-1e6)
        return npx.softmax(X).reshape(shape)
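# A usage sketch for the masked_softmax variants above: attention weights
# beyond each valid length become (numerically) zero while every row of the
# output still sums to 1.
from mxnet import np, npx
npx.set_np()

scores = np.random.uniform(size=(2, 2, 4))        # (batch, queries, keys)
attn = masked_softmax(scores, np.array([2, 3]))
# attn[0, :, 2:] and attn[1, :, 3:] are ~0 because those logits were set
# to -1e6 before the softmax.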
def forward(self, source_encoded: np.ndarray,
            source_encoded_length: np.ndarray) -> np.ndarray:
    """
    Transformation to the length ratio. Returns a vector.

    :param source_encoded: Encoder representation for n elements.
                           Shape: (n, source_encoded_length, hidden_size).
    :param source_encoded_length: A vector of encoded sequence lengths.
                                  Shape: (n,).
    :return: Predictions of the ratio length(hypothesis)/length(reference).
             Shape: (n, 1).
    """
    # source_masked: (n, source_encoded_length, hidden_size)
    source_masked = npx.sequence_mask(source_encoded, axis=1,
                                      sequence_length=source_encoded_length,
                                      use_sequence_length=True, value=0.)
    # Calculate the proper means of the encoded sources
    # data: (n, hidden_size)
    data = np.sum(source_masked, axis=1, keepdims=False) / np.reshape(
        source_encoded_length, (-1, 1))
    # MLP. Shape: (n, 1)
    data = self.layers(data)
    # Shape: (n,)
    return np.squeeze(data)
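# A hedged check of the masked mean-pooling above (illustrative values only):
# zeroing padded timesteps before summing and dividing by the true lengths
# keeps padding out of the per-sequence average.
from mxnet import np, npx
npx.set_np()

enc = np.ones((2, 4, 8))                          # (n, seq, hidden)
lengths = np.array([2., 4.])
pooled = np.sum(npx.sequence_mask(enc, axis=1, sequence_length=lengths,
                                  use_sequence_length=True, value=0.),
                axis=1) / lengths.reshape(-1, 1)
# pooled is all ones; without the mask, row 0 would average in its two
# padded timesteps.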
def forward(self, pred, label, valid_len):
    # weights shape: (batch_size, seq_len, 1)
    weights = np.expand_dims(np.ones_like(label), axis=-1)
    weights = npx.sequence_mask(weights, valid_len, True, axis=1)
    return super(MaskedSoftmaxCELoss, self).forward(pred, label, weights)
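# A minimal usage sketch (hedged: assumes MaskedSoftmaxCELoss subclasses a
# softmax cross-entropy loss that accepts per-token sample weights, as the
# definition above implies): tokens at or beyond valid_len get weight 0 and
# contribute nothing to the loss.
from mxnet import np, npx
npx.set_np()

loss_fn = MaskedSoftmaxCELoss()
pred = np.random.uniform(size=(2, 5, 10))         # (batch, seq, vocab)
label = np.zeros((2, 5))
l = loss_fn(pred, label, np.array([3, 5]))        # row 0: only tokens 0-2 count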
def dynamic_masking(self, input_ids, valid_lengths):
    # TODO(zheyuye), two additional flags `disallow_from_mask` and `already_masked`
    # that control the masking status for each position in the sequence.
    """Generate masking positions on-the-fly instead of during preprocessing.

    Parameters
    ----------
    input_ids
        The batchified input_ids with shape (batch_size, max_seq_length)
    valid_lengths
        The batchified valid_lengths with shape (batch_size,)

    Returns
    -------
    masked_input_ids
        The masked input sequence, in which ~15% of the tokens are replaced
        with [MASK]. Shape (batch_size, max_seq_length)
    length_masks
        The mask for the whole sequence, marking the positions that lie
        within valid_length. Shape (batch_size, max_seq_length)
    unmasked_tokens
        The original tokens at the masked positions.
        Shape (batch_size, num_masked_positions)
    masked_positions
        The masking positions as an mx.np.ndarray.
        Shape (batch_size, num_masked_positions)
    masked_lm_weights
        The weight matrix of 0s and 1s marking which masked positions are
        actually in effect. Shape (batch_size, num_masked_positions)
    """
    N = self._max_num_masked_position
    # Only valid tokens, excluding special tokens, are allowed to be masked
    valid_candidates = np.ones_like(input_ids, dtype=np.bool)
    ignore_tokens = [self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id]
    for ignore_token in ignore_tokens:
        # TODO(zheyuye), update when the += operator is supported
        valid_candidates = valid_candidates * \
            np.not_equal(input_ids, ignore_token)
    valid_lengths = valid_lengths.astype(np.float32)
    valid_candidates = valid_candidates.astype(np.float32)
    num_masked_position = np.maximum(
        1, np.minimum(N, round(valid_lengths * self._mask_prob)))

    # Get the masking probability of each position
    sample_probs = self._proposal_distribution * valid_candidates
    sample_probs /= np.sum(sample_probs, axis=-1, keepdims=True)
    sample_probs = npx.stop_gradient(sample_probs)
    gumbels = np.random.gumbel(np.zeros_like(sample_probs))
    # Following the official repo, sample with top-k over Gumbel-perturbed
    # log-probabilities to avoid duplicate positions, see
    # https://github.com/google-research/electra/issues/41
    masked_positions = npx.topk(np.log(sample_probs) + gumbels, k=N,
                                axis=-1, ret_typ='indices', dtype=np.int32)

    masked_weights = npx.sequence_mask(np.ones_like(masked_positions),
                                       sequence_length=num_masked_position,
                                       use_sequence_length=True,
                                       axis=1, value=0)
    masked_positions = masked_positions * masked_weights
    length_masks = npx.sequence_mask(np.ones_like(input_ids, dtype=np.float32),
                                     sequence_length=valid_lengths,
                                     use_sequence_length=True,
                                     axis=1, value=0)
    unmasked_tokens = select_vectors_by_position(
        input_ids, masked_positions) * masked_weights
    masked_weights = masked_weights.astype(np.float32)
    replaced_positions = (np.random.uniform(
        np.zeros_like(masked_positions),
        np.ones_like(masked_positions)) < self._replace_prob) * masked_positions
    # Deal with the multiple zero values in replaced_positions, which would
    # otherwise cause the [CLS] token to be replaced
    filled = np.where(replaced_positions, self.vocab.mask_id,
                      self.vocab.cls_id).astype(np.int32)
    # Mask tokens by replacing them with [MASK]
    masked_input_ids = update_vectors_by_position(input_ids, filled,
                                                  replaced_positions)

    # Note: masked_positions may contain multiple zeros if the number of
    # masked positions has not reached the maximum. However, this case hardly
    # occurs, since valid_length is almost always equal to max_seq_length.
    masked_input = self.MaskedInput(input_ids=masked_input_ids,
                                    masks=length_masks,
                                    unmasked_tokens=unmasked_tokens,
                                    masked_positions=masked_positions,
                                    masked_weights=masked_weights)
    return masked_input
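# An illustrative sketch of the Gumbel-top-k trick used in dynamic_masking:
# adding Gumbel noise to the log-probabilities and taking the top-k indices
# draws k distinct positions (roughly proportional to the probabilities),
# avoiding the duplicates that repeated categorical sampling could produce.
from mxnet import np, npx
npx.set_np()

probs = np.array([[0.1, 0.2, 0.3, 0.4]])
g = np.random.gumbel(np.zeros_like(probs))
picks = npx.topk(np.log(probs) + g, k=2, axis=-1,
                 ret_typ='indices', dtype=np.int32)   # 2 distinct indices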
def forward(self, data, valid_length):
    """Generate the representation given the inputs.

    This is used in training or fine-tuning a MobileBERT model.

    Parameters
    ----------
    data
        - layout = 'NT'
            Shape (batch_size, seq_length, C)
        - layout = 'TN'
            Shape (seq_length, batch_size, C)
    valid_length
        Shape (batch_size,)

    Returns
    -------
    out
        - layout = 'NT'
            Shape (batch_size, seq_length, C_out)
        - layout = 'TN'
            Shape (seq_length, batch_size, C_out)
    """
    if self._layout == 'NT':
        batch_axis, time_axis = 0, 1
    elif self._layout == 'TN':
        batch_axis, time_axis = 1, 0
    else:
        raise NotImplementedError(
            'Received layout="{}". '
            'Only "NT" and "TN" are supported.'.format(self._layout))
    # 1. Embed the data
    attn_mask = gen_self_attn_mask(data, valid_length, dtype=self._dtype,
                                   layout=self._layout, attn_type='full')
    out = data
    all_encodings_outputs = []
    additional_outputs = []
    all_encodings_outputs.append(out)
    for layer_idx in range(self._num_layers):
        layer = self.all_layers[layer_idx]
        out, attention_weights = layer(out, attn_mask)
        # out : [batch_size, seq_len, units]
        # attention_weights : [batch_size, num_heads, seq_len, seq_len]
        if self._output_all_encodings:
            out = npx.sequence_mask(out, sequence_length=valid_length,
                                    use_sequence_length=True, axis=time_axis)
            all_encodings_outputs.append(out)
        if self._output_attention:
            additional_outputs.append(attention_weights)
    if not self._output_all_encodings:
        # if self._output_all_encodings, SequenceMask is already applied above
        out = npx.sequence_mask(out, sequence_length=valid_length,
                                use_sequence_length=True, axis=time_axis)
        return out, additional_outputs
    else:
        return all_encodings_outputs, additional_outputs