from typing import cast

import torch as pt
from transformers import PreTrainedTokenizer


def mask_at_indexes(
    tokenizer: PreTrainedTokenizer,
    ids: pt.LongTensor,
    tokens_mask: pt.BoolTensor,
    indexes: pt.LongTensor,
) -> pt.LongTensor:
    # gather the token ids covered by the mask, replace those at the given
    # indexes with the tokenizer's mask token, and scatter them back into
    # a copy of the original ids
    masked_token_ids = ids.masked_select(tokens_mask)
    masked_token_ids[indexes] = tokenizer.mask_token_id
    masked_ids = ids.masked_scatter(tokens_mask, masked_token_ids)
    masked_ids = cast(pt.LongTensor, masked_ids)
    return masked_ids
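A minimal usage sketch follows, assuming a Hugging Face tokenizer that defines `mask_token_id`; the `bert-base-uncased` checkpoint and the example tensors are illustrative, not taken from the code above:

import torch as pt
from transformers import AutoTokenizer

# assumption: any tokenizer with a mask token works here
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# "[CLS] hello world [SEP] [PAD] [PAD]" as bert-base-uncased token ids
ids = pt.tensor([[101, 7592, 2088, 102, 0, 0]])
# only the two word tokens are eligible for masking
tokens_mask = pt.tensor([[False, True, True, False, False, False]])
# mask the first eligible token ("hello")
indexes = pt.tensor([0])

masked = mask_at_indexes(tokenizer, ids, tokens_mask, indexes)
# masked -> [[101, 103, 2088, 102, 0, 0]]; 103 is [MASK] for bert-base-uncased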
import math
from typing import Tuple

import torch


def prepare_batch(
    self, sequences: torch.LongTensor, lengths: torch.LongTensor
) -> Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
    # compute the attention mask from the per-sequence lengths; cast to
    # long so it matches the declared return type
    batch_size, max_seq_len = sequences.size()
    attention_mask = (
        torch.arange(max_seq_len, dtype=torch.long, device=lengths.device)
        < lengths[:, None]
    ).long()

    # prepare the target
    target = sequences.clone().detach()

    # get the token probabilities of the sequences in the batch
    _token_probabilities = self.token_probabilities[sequences.view(-1)]

    # compute the number of targets (tokens for which a prediction needs
    # to be made)
    num_targets = math.ceil(self.pred_proportion * lengths.sum().item())

    # compute the prediction mask by sampling target positions without
    # replacement, then zeroing out padding positions
    target_idxs = torch.multinomial(
        _token_probabilities / _token_probabilities.sum(),
        num_targets,
        replacement=False,
    )
    pred_mask = torch.zeros(
        batch_size * max_seq_len, dtype=torch.bool, device=sequences.device
    )
    pred_mask[target_idxs] = True
    pred_mask = pred_mask.view(batch_size, max_seq_len)
    pred_mask[
        sequences == self.dataloader.dataset.special_tokens_map['pad_token']
    ] = False

    # compute the prediction tokens: for each target position, draw whether
    # it is masked, kept, or replaced by a random vocabulary token
    sequences_keep = sequences[pred_mask]
    sequences_rand = sequences_keep.clone().random_(
        self.dataloader.dataset._tokenizer.get_vocab_size()
    )
    sequences_mask = sequences_keep.clone().fill_(
        self.dataloader.dataset.special_tokens_map['mask_token']
    )
    pred_idxs = torch.multinomial(
        self.pred_probabilities, len(sequences_keep), replacement=True
    )
    pred_tokens = (
        sequences_mask * (pred_idxs == 0).long()
        + sequences_keep * (pred_idxs == 1).long()
        + sequences_rand * (pred_idxs == 2).long()
    )

    # copy the prediction tokens into the sequences, given the prediction mask
    sequences = sequences.masked_scatter(pred_mask, pred_tokens)

    # ignore tokens that are not in the prediction mask (-100 is the default
    # ignore_index of PyTorch's cross-entropy loss)
    target[~pred_mask] = -100

    return sequences, attention_mask, target
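The three-way draw over `pred_idxs` implements BERT-style corruption: index 0 replaces a selected token with the mask token, index 1 keeps it unchanged, and index 2 substitutes a random vocabulary token. Below is a sketch of the attributes the method relies on, under the assumption of a hypothetical host class; the 0.8/0.1/0.1 split is the BERT convention and the uniform `token_probabilities` are illustrative, since the real values are set elsewhere in the codebase:

import torch


class MLMTrainer:
    # hypothetical host class for prepare_batch (illustrative only)

    def __init__(self, dataloader, vocab_size: int):
        self.dataloader = dataloader
        # fraction of tokens in the batch to select as prediction targets
        self.pred_proportion = 0.15
        # categorical weights for torch.multinomial over the three outcomes:
        # index 0 -> mask token, index 1 -> keep, index 2 -> random token
        # (0.8/0.1/0.1 follows the BERT convention; values are assumptions)
        self.pred_probabilities = torch.tensor([0.8, 0.1, 0.1])
        # per-vocabulary-id weights used when sampling target positions;
        # uniform weights make every non-padding position equally likely
        self.token_probabilities = torch.ones(vocab_size)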