def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """
    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_b"]["start_of_word"])

        # convert lm labels to ids
        t1_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]
        t2_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t2_label]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in Bert, is_next_label_id = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        # convert lm labels to ids
        lm_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]

    # encode string tokens to input_ids and add special tokens
    inputs = tokenizer.encode_plus(text=tokens_a,
                                   text_pair=tokens_b,
                                   add_special_tokens=True,
                                   max_length=max_seq_len,
                                   truncation_strategy='do_not_truncate'  # We've already truncated our tokens before
                                   )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {"input_ids": input_ids,
                    "padding_mask": padding_mask,
                    "segment_ids": segment_ids,
                    "lm_label_ids": lm_label_ids}

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
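
# --- Illustrative sketch (not part of the original code) ---------------------
# The function above calls an `insert_at_special_tokens_pos` helper that is not
# shown in this snippet. A minimal sketch of what such a helper could look like,
# assuming it inserts `insert_element` into the label sequence at every position
# where `special_tokens_mask` is 1 (i.e. at the [CLS]/[SEP] positions):
def insert_at_special_tokens_pos(seq, special_tokens_mask, insert_element):
    # Work on a copy so the caller's label list is not modified in place.
    new_seq = list(seq)
    # Positions of special tokens in the final (special-token-augmented) sequence.
    special_tokens_indices = [idx for idx, is_special in enumerate(special_tokens_mask) if is_special == 1]
    # Inserting in increasing index order keeps later positions valid as the list grows.
    for idx in special_tokens_indices:
        new_seq.insert(idx, insert_element)
    return new_seq

# Example: labels for three real tokens, mask for "[CLS] tok tok tok [SEP]"
# insert_at_special_tokens_pos([5, -1, 7], [1, 0, 0, 0, 1], -1) -> [-1, 5, -1, 7, -1]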
def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """
    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_b"]["start_of_word"])

        if tokenizer.is_fast:
            # Detokenize input as fast tokenizer can't handle tokenized input
            tokens_a = " ".join(tokens_a)
            tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a)
            tokens_b = " ".join(tokens_b)
            tokens_b = re.sub(r"(^|\s)(##)", "", tokens_b)

        # convert lm labels to ids
        t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label]
        t2_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t2_label]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in Bert, is_next_label_id = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        if tokenizer.is_fast:
            # Detokenize input as fast tokenizer can't handle tokenized input
            tokens_a = " ".join(tokens_a)
            tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a)

        # convert lm labels to ids
        lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label]

    if tokenizer.is_fast:
        inputs = tokenizer(text=tokens_a,
                           text_pair=tokens_b,
                           add_special_tokens=True,
                           return_special_tokens_mask=True,
                           return_token_type_ids=True)

        seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0
        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \
                (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata(). \n"
                         f"Further processing is likely to be wrong.")
    else:
        # encode string tokens to input_ids and add special tokens
        inputs = tokenizer.encode_plus(text=tokens_a,
                                       text_pair=tokens_b,
                                       add_special_tokens=True,
                                       truncation=False,
                                       truncation_strategy='do_not_truncate',  # We've already truncated our tokens before
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True
                                       )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {"input_ids": input_ids,
                    "padding_mask": padding_mask,
                    "segment_ids": segment_ids,
                    "lm_label_ids": lm_label_ids}

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
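
# --- Illustrative sketch (not part of the original code) ---------------------
# The fast-tokenizer branch above re-joins the already tokenized WordPiece input
# into a plain string before re-encoding it, stripping the "##" continuation
# markers with a regex. A small standalone demo of that detokenization step
# (the token values below are made up purely for illustration):
import re

wordpiece_tokens = ["jack", "##son", "##ville", "is", "nice"]
text = " ".join(wordpiece_tokens)        # "jack ##son ##ville is nice"
text = re.sub(r"(^|\s)(##)", "", text)   # drop "##" and the space before it
print(text)                              # -> "jacksonville is nice"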
def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """
    tokens_a = sample.tokenized["text_a"]["tokens"]
    tokens_b = sample.tokenized["text_b"]["tokens"]

    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] or [CLS], [SEP]
    if not next_sent_pred:
        n_special_tokens = 2
    else:
        n_special_tokens = 3
    truncate_seq_pair(tokens_a, tokens_b, max_seq_len - n_special_tokens)

    tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                           token_groups=sample.tokenized["text_a"]["start_of_word"])
    # convert lm labels to ids
    t1_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]

    if next_sent_pred:
        tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_b"]["start_of_word"])
        t2_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t2_label]
        # concatenate lm labels and account for CLS, SEP, SEP
        lm_label_ids = [-1] + t1_label_ids + [-1] + t2_label_ids + [-1]
    else:
        lm_label_ids = [-1] + t1_label_ids + [-1]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids:   0   0    0    0    0      0    0   0   1  1  1   1  1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids:   0    0   0  0    0   0   0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if next_sent_pred:
        assert len(tokens_b) > 0
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_len:
        input_ids.append(0)
        padding_mask.append(0)
        segment_ids.append(0)
        lm_label_ids.append(-1)

    # Convert is_next_label: Note that in Bert, is_next_label_id = 0 is used for next_sentence=true!
    if next_sent_pred:
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    feature_dict = {"input_ids": input_ids,
                    "padding_mask": padding_mask,
                    "segment_ids": segment_ids,
                    "lm_label_ids": lm_label_ids}

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    return [feature_dict]
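
# --- Illustrative sketch (not part of the original code) ---------------------
# `truncate_seq_pair`, used by the version above, is also not defined in this
# snippet. A sketch along the lines of the original BERT reference implementation,
# assuming it trims the token lists in place as the comment above describes:
def truncate_seq_pair(tokens_a, tokens_b, max_length):
    # Always pop from the end of the currently longer sequence, so both
    # segments are shortened roughly equally instead of cutting only one.
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()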