def __call__(self, instance, d_type):
    label, tokens_a, tokens_b = instance

    # -3 special tokens for [CLS] text_a [SEP] text_b [SEP]
    # -2 special tokens for [CLS] text_a [SEP]
    _max_len = self.max_len - 3 if tokens_b else self.max_len - 2
    truncate_tokens_pair(tokens_a, tokens_b, _max_len)

    # Add Special Tokens
    tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
    tokens_b = tokens_b + ['[SEP]'] if tokens_b else []

    return (label, tokens_a, tokens_b)
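Both methods rely on a truncate_tokens_pair helper defined elsewhere. For reference, a minimal sketch of such a helper, assuming it follows the usual BERT convention of trimming tokens from the end of the longer sequence until the pair fits the budget, could look like this (the in-place mutation matters, since the callers above keep using tokens_a and tokens_b afterwards):

def truncate_tokens_pair(tokens_a, tokens_b, max_len):
    # Trim whichever list is currently longer, one token at a time,
    # until the combined length fits within max_len.
    # tokens_b may be an empty list for single-sentence inputs.
    while len(tokens_a) + len(tokens_b) > max_len:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()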
def __call__(self, instance):
    is_not_next, tokens_a, tokens_b = instance

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

    # Add Special Tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0]*(len(tokens_a)+2) + [1]*(len(tokens_b)+1)
    input_mask = [1]*len(tokens)

    # For the masked language model (MLM)
    masked_tokens, masked_pos = [], []
    # the number of predictions can be less than max_pred when the sequence is short
    n_pred = min(self.max_pred, max(1, int(round(len(tokens)*self.mask_prob))))
    # candidate positions for masking (special tokens are never masked)
    cand_pos = [i for i, token in enumerate(tokens)
                if token != '[CLS]' and token != '[SEP]']
    shuffle(cand_pos)
    for pos in cand_pos[:n_pred]:
        masked_tokens.append(tokens[pos])
        masked_pos.append(pos)
        if rand() < 0.8:    # 80%: replace with [MASK]
            tokens[pos] = '[MASK]'
        elif rand() < 0.5:  # 10% (half of the remaining 20%): replace with a random word
            tokens[pos] = get_random_word(self.vocab_words)
        # otherwise (10%): keep the original token
    # when n_pred < max_pred, the loss is only computed over the n_pred positions
    masked_weights = [1]*len(masked_tokens)

    # Token Indexing
    input_ids = self.indexer(tokens)
    masked_ids = self.indexer(masked_tokens)

    # Zero Padding
    n_pad = self.max_len - len(input_ids)
    input_ids.extend([0]*n_pad)
    segment_ids.extend([0]*n_pad)
    input_mask.extend([0]*n_pad)

    # Zero Padding for the masked targets
    if self.max_pred > n_pred:
        n_pad = self.max_pred - n_pred
        masked_ids.extend([0]*n_pad)
        masked_pos.extend([0]*n_pad)
        masked_weights.extend([0]*n_pad)

    return (input_ids, segment_ids, input_mask,
            masked_ids, masked_pos, masked_weights, is_not_next)
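The nested rand() calls are easy to misread: the elif branch only runs in the 20% of cases where the first test fails, so rand() < 0.5 there amounts to 10% overall, and the remaining 10% of selected positions keep their original token, matching BERT's 80/10/10 masking scheme. A self-contained simulation (independent of the pipeline code; mask_decision is a hypothetical helper written just for this check) illustrates the split:

from random import random as rand

def mask_decision():
    # Mirror the branch structure of the masking loop above and
    # report which action would be taken for one selected position.
    if rand() < 0.8:
        return 'mask'    # replace with [MASK]            (~80%)
    elif rand() < 0.5:
        return 'random'  # replace with a random word     (~10%)
    else:
        return 'keep'    # leave the original token as-is (~10%)

counts = {'mask': 0, 'random': 0, 'keep': 0}
trials = 100_000
for _ in range(trials):
    counts[mask_decision()] += 1
print({k: round(v / trials, 3) for k, v in counts.items()})
# roughly {'mask': 0.8, 'random': 0.1, 'keep': 0.1}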