Example 1
from typing import cast

import torch as pt
from transformers import PreTrainedTokenizer


def mask_at_indexes(
    tokenizer: PreTrainedTokenizer,
    ids: pt.LongTensor,
    tokens_mask: pt.BoolTensor,
    indexes: pt.LongTensor,
) -> pt.LongTensor:
    # gather the token ids selected by the boolean mask
    masked_token_ids = ids.masked_select(tokens_mask)
    # replace the tokens at the requested positions with the tokenizer's mask token id
    masked_token_ids[indexes] = tokenizer.mask_token_id
    # scatter the partially masked tokens back into a copy of the original ids
    masked_ids = ids.masked_scatter(tokens_mask, masked_token_ids)
    masked_ids = cast(pt.LongTensor, masked_ids)
    return masked_ids
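
A minimal usage sketch; the checkpoint name and the way the maskable-token mask is built here are illustrative assumptions, not part of the snippet above.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
encoding = tokenizer("a short example sentence", return_tensors="pt")
ids = encoding["input_ids"][0]
# mark every non-special token as maskable
special = tokenizer.get_special_tokens_mask(ids.tolist(), already_has_special_tokens=True)
tokens_mask = ~pt.tensor(special, dtype=pt.bool)
# replace the first two maskable tokens with the tokenizer's mask token
masked_ids = mask_at_indexes(tokenizer, ids, tokens_mask, pt.tensor([0, 1]))
print(tokenizer.decode(masked_ids))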
Example 2
    def prepare_batch(
        self, sequences: torch.LongTensor, lengths: torch.LongTensor
    ) -> Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
        # compute the attention mask
        batch_size, max_seq_len = sequences.size()
        attention_mask = (
            torch.arange(max_seq_len, dtype=torch.long, device=lengths.device)
            < lengths[:, None]
        )

        # prepare the target
        target = sequences.clone().detach()

        # get the token probabilities of the sequences in the batch
        _token_probabilities = self.token_probabilities[sequences.view(-1)]

        # compute the number of targets (tokens for which a prediction needs to be made)
        num_targets = math.ceil(self.pred_proportion * lengths.sum().item())

        # compute the prediction mask
        target_idxs = torch.multinomial(
            _token_probabilities / _token_probabilities.sum(),
            num_targets,
            replacement=False,
        )
        pred_mask = torch.zeros(
            batch_size * max_seq_len, dtype=torch.bool, device=sequences.device
        )
        pred_mask[target_idxs] = True
        pred_mask = pred_mask.view(batch_size, max_seq_len)
        # never predict padding positions
        pred_mask[sequences == self.dataloader.dataset.special_tokens_map['pad_token']] = False

        # compute the prediction tokens
        sequences_keep = sequences[pred_mask]
        sequences_rand = sequences_keep.clone().random_(
            self.dataloader.dataset._tokenizer.get_vocab_size()
        )
        sequences_mask = sequences_keep.clone().fill_(
            self.dataloader.dataset.special_tokens_map['mask_token']
        )
        pred_idxs = torch.multinomial(
            self.pred_probabilities, len(sequences_keep), replacement=True
        )
        # index 0 -> mask token, index 1 -> keep the original token, index 2 -> random token
        pred_tokens = (
            sequences_mask * (pred_idxs == 0).long()
            + sequences_keep * (pred_idxs == 1).long()
            + sequences_rand * (pred_idxs == 2).long()
        )

        # copy the prediction tokens into the sequences, given the prediction mask
        sequences = sequences.masked_scatter(pred_mask, pred_tokens)

        # ignore tokens that are not in the prediction mask
        target[~pred_mask] = -100

        return sequences, attention_mask, target
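
The returned target uses -100 outside the prediction mask, which is the default ignore index of PyTorch's cross-entropy loss. A minimal, illustrative sketch of how the outputs would typically be consumed; the logits tensor and the target values below are stand-ins, not produced by the method above.

import torch
import torch.nn.functional as F

batch_size, max_seq_len, vocab_size = 2, 8, 100
# stand-in for the language model's output on `sequences` (with `attention_mask` applied)
logits = torch.randn(batch_size, max_seq_len, vocab_size)
# stand-in for `target`: -100 everywhere except the sampled prediction positions
target = torch.full((batch_size, max_seq_len), -100, dtype=torch.long)
target[0, 3] = 17
target[1, 5] = 42
# positions equal to -100 are ignored, so the loss is computed only on the predicted tokens
loss = F.cross_entropy(logits.view(-1, vocab_size), target.view(-1), ignore_index=-100)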