# Assumes module-level imports: copy, itertools, numpy as np,
# and random, randint, shuffle from the random module.
def convert_example_to_features(self, tokens_a, tokens_b, is_next_label, max_pos,
                                short_seq_prob=0.1, masked_lm_prob=0.15):
    """
    Convert a raw sample (pair of tokenized sentences) into a proper training sample
    with IDs, LM labels, input_mask, CLS and SEP tokens etc.
    :param tokens_a: list of int, token ids of the first sentence.
    :param tokens_b: list of int, token ids of the next sentence.
    :param is_next_label: int, is-next label.
    :param max_pos: int, maximum length of the sequence.
    :param short_seq_prob: float, probability of creating sequences shorter than the maximum length.
    :param masked_lm_prob: float, masked LM probability.
    :return: features
    """
    target_max_pos = max_pos - 3 if tokens_b else max_pos - 2
    tokens_a_ids = copy.copy(tokens_a)
    tokens_b_ids = copy.copy(tokens_b)
    # Sometimes use shorter sequences to minimize the mismatch
    # between pre-training and fine-tuning.
    if random() < short_seq_prob:
        target_max_pos = randint(2, target_max_pos - 1)

    truncate_seq_pair(tokens_a_ids, tokens_b_ids, target_max_pos)

    # Add special tokens
    tokens_a_ids.insert(0, self.cls_id)
    tokens_a_ids.append(self.sep_id)
    if len(tokens_b_ids) != 0:
        tokens_b_ids.append(self.sep_id)
    else:
        tokens_b_ids = []

    tokens = copy.copy(tokens_a_ids)
    tokens.extend(copy.copy(tokens_b_ids))

    # Add next sentence segment: 0 for the first sentence, 1 for the next sentence
    segment_ids = [0] * len(tokens_a_ids) + [1] * len(tokens_b_ids)
    lm_label_ids = [self.pad_id] * len(tokens_a_ids) + [self.pad_id] * len(
        tokens_b_ids)

    # Number of tokens to mask
    mask_prediction = int(round(len(tokens) * masked_lm_prob))
    mask_candidate_pos = [
        i for i, token in enumerate(tokens)
        if token != self.cls_id and token != self.sep_id
    ]

    # Mask, replace with a random token, or keep the original token
    shuffle(mask_candidate_pos)
    for pos in mask_candidate_pos[:mask_prediction]:
        if random() < 0.8:  # 80%
            # masked
            lm_label_ids[pos] = tokens[pos]
            tokens[pos] = self.mask_id
        elif random() < 0.5:  # 10%
            # random token
            lm_label_ids[pos] = tokens[pos]
            tokens[pos] = self.get_random_token_id()
        else:  # 10%
            # keep the original token (not masked, not modified)
            lm_label_ids[pos] = tokens[pos]

    input_ids = tokens
    input_mask = [1] * len(input_ids)

    # zero padding
    num_zero_pad = max_pos - len(input_ids)
    input_ids.extend([self.pad_id] * num_zero_pad)
    segment_ids.extend([0] * num_zero_pad)
    input_mask.extend([0] * num_zero_pad)
    lm_label_ids.extend([self.pad_id] * num_zero_pad)

    return [
        input_ids, segment_ids, input_mask, is_next_label, lm_label_ids
    ]
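# Both variants call a truncate_seq_pair helper that is not shown in this snippet.
# The following is a minimal sketch, assuming it follows the standard BERT
# preprocessing code (drop one token at a time from the end of the longer sequence
# until the pair fits the length budget); the real implementation in this codebase
# may differ and may live elsewhere as a module-level helper.
def truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncate the pair in place until the combined length fits max_length."""
    while len(tokens_a) + len(tokens_b) > max_length:
        # Drop a token from the end of the currently longer sequence
        # so both sides shrink roughly evenly.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()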
def convert_example_to_features(self, token_ids, max_pos,
                                short_seq_prob=0.1, masked_lm_prob=0.15):
    """
    Convert a raw sample into a proper training sample
    with IDs, LM labels, input_mask, CLS and SEP tokens etc.
    :param token_ids: list, token ids of the example (a list of per-sentence id lists
        when SOP is enabled, a flat list of ids otherwise).
    :param max_pos: int, maximum length of the sequence.
    :param short_seq_prob: float, probability of creating sequences shorter than the maximum length.
    :param masked_lm_prob: float, masked LM probability.
    :return: features
    """
    target_max_pos = max_pos - self.bert_ids_num
    # Sometimes use shorter sequences to minimize the mismatch
    # between pre-training and fine-tuning.
    if random() < short_seq_prob:
        target_max_pos = randint(2, target_max_pos - 1)

    if self.is_sop:
        # Split the sentences into two segments at a random boundary
        split_ids = copy.copy(token_ids)
        if len(split_ids) > 2:
            split_id = randint(1, len(split_ids) - 1)
            tokens_a = []
            for i in range(split_id):
                tokens_a.append(split_ids[i])
            tokens_b = []
            for i in range(split_id, len(split_ids)):
                tokens_b.append(split_ids[i])
        else:
            tokens_a = split_ids
            tokens_b = []

        # Sentence-order prediction: swap the two segments half of the time
        if len(tokens_b) > 0 and random() < 0.5:
            is_random_next = 1
            temp = tokens_a
            tokens_a = tokens_b
            tokens_b = temp
        else:
            is_random_next = 0

        # Flatten the per-sentence id lists
        tokens_a = list(itertools.chain.from_iterable(tokens_a))
        tokens_b = list(itertools.chain.from_iterable(tokens_b))
        truncate_seq_pair(tokens_a, tokens_b, target_max_pos)

        # Add special tokens
        tokens_a.insert(0, self.cls_id)
        tokens_a.append(self.sep_id)
        if len(tokens_b) > 0:
            tokens_b.append(self.sep_id)
        else:
            tokens_b = []

        lm_label_ids = [self.pad_id] * len(tokens_a + tokens_b)
        bert_token_ids = copy.copy(tokens_a)
        bert_token_ids.extend(copy.copy(tokens_b))

        # Add next sentence segment: 0 for the first segment, 1 for the second
        segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)
    else:
        label_ids = copy.copy(token_ids)
        truncate_seq_pair(label_ids, [], target_max_pos)

        # Add special tokens
        label_ids = [self.cls_id] + label_ids + [self.sep_id]
        lm_label_ids = [self.pad_id] * len(label_ids)
        bert_token_ids = copy.copy(label_ids)
        segment_ids = [0] * len(label_ids)
        is_random_next = 0

    # Number of tokens to mask
    mask_prediction = round(
        (len(bert_token_ids) - self.bert_ids_num) * masked_lm_prob)
    mask_candidate_pos = [
        i for i, token in enumerate(bert_token_ids)
        if token != self.cls_id and token != self.sep_id
    ]

    # Sample a span length from a geometric distribution (p=0.4) for span masking
    mask_length = np.random.geometric(0.4)
    if mask_length > self.max_words_length:
        mask_length = self.max_words_length
    if mask_length > mask_prediction:
        mask_length = mask_prediction

    if mask_length > 0:
        # Group the candidate positions into spans of roughly mask_length tokens
        mask_candidate_words = np.array_split(
            mask_candidate_pos, int(len(mask_candidate_pos) / mask_length))
        shuffle(mask_candidate_words)
        masked = 0
        # Mask, replace with a random token, or keep the original token
        for words in mask_candidate_words:
            for pos in words:
                masked += 1
                if random() < 0.8:  # 80%
                    # masked
                    lm_label_ids[pos] = bert_token_ids[pos]
                    bert_token_ids[pos] = self.mask_id
                elif random() < 0.5:  # 10%
                    # random token
                    lm_label_ids[pos] = bert_token_ids[pos]
                    bert_token_ids[pos] = self.get_random_token_id()
                else:  # 10%
                    # keep the original token (not masked, not modified)
                    lm_label_ids[pos] = bert_token_ids[pos]

                if masked == mask_prediction:
                    break
            if masked == mask_prediction:
                break

    input_ids = bert_token_ids

    # zero padding
    num_zero_pad = self.max_pos - len(input_ids)
    input_mask = [1] * len(input_ids)
    if segment_ids:
        segment_ids.extend([0] * num_zero_pad)
    input_mask.extend([0] * num_zero_pad)
    input_ids.extend([self.pad_id] * num_zero_pad)
    lm_label_ids.extend([self.pad_id] * num_zero_pad)

    return [
        input_ids, segment_ids, input_mask, is_random_next, lm_label_ids
    ]
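# The span length used for masking above is drawn from a geometric distribution with
# p = 0.4, so short spans are heavily favoured. The standalone toy snippet below
# illustrates the resulting length distribution; max_words_length = 4 is a
# hypothetical cap standing in for self.max_words_length.
#
# import numpy as np
#
# max_words_length = 4
# lengths = np.minimum(np.random.geometric(0.4, size=100_000), max_words_length)
# freqs = np.bincount(lengths, minlength=max_words_length + 1)[1:] / len(lengths)
# print(dict(enumerate(freqs, start=1)))
# # Roughly {1: 0.40, 2: 0.24, 3: 0.14, 4: 0.22}: one-word spans dominate and the
# # cap absorbs the tail of the distribution.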