def featurize(self, tokenizer, feat_spec):
    """Convert this tagging example into a padded DataRow.

    Runs the text through the shared single-input pipeline to get padded
    input ids / mask / segment ids, then mirrors the special-token layout
    ([CLS] ... [SEP], or [CLS] ... [SEP][SEP] when ``sep_token_extra``)
    onto the per-token label ids and label mask before padding them to
    the same length.
    """
    unpadded_inputs = construct_single_input_tokens_and_segment_ids(
        input_tokens=self.text,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    # Replicate the CLS/SEP positions in the label sequence and mask.
    if feat_spec.sep_token_extra:
        num_special = 3  # CLS, SEP-SEP
        trailing_labels = [None, None]
        trailing_mask = [0, 0]
    else:
        num_special = 2  # CLS, SEP
        trailing_labels = [None]
        trailing_mask = [0]

    keep = feat_spec.max_seq_length - num_special
    unpadded_labels = [None] + self.labels[:keep] + trailing_labels
    # Special-token positions carry label -1 (ignored downstream).
    unpadded_labels = [-1 if label is None else label for label in unpadded_labels]
    unpadded_label_mask = [0] + self.label_mask[:keep] + trailing_mask

    padded_labels = pad_single_with_feat_spec(
        ls=unpadded_labels,
        feat_spec=feat_spec,
        pad_idx=-1,
    )
    padded_label_mask = pad_single_with_feat_spec(
        ls=unpadded_label_mask,
        feat_spec=feat_spec,
        pad_idx=0,
    )
    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        label_ids=np.array(padded_labels),
        label_mask=np.array(padded_label_mask),
        tokens=unpadded_inputs.unpadded_tokens,
    )
def featurize(self, tokenizer, feat_spec):
    """Convert this masked-LM example into a padded DataRow.

    The masked token sequence goes through the standard single-input
    pipeline. The (unmasked) label tokens are truncated and wrapped with
    pad tokens so they line up position-for-position with the masked
    inputs; positions holding the pad id are then remapped to
    ``mlm_template.NON_MASKED_TOKEN_LABEL_ID`` so the loss ignores them.
    """
    # --- Masked-input side ---
    unpadded_masked_inputs = construct_single_input_tokens_and_segment_ids(
        input_tokens=self.masked_tokens,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    masked_input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_masked_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_masked_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    # --- Label side ---
    # NOTE(review): assumes exactly CLS + SEP. If feat_spec.sep_token_extra
    # is set, the input side gains an extra SEP and the labels would
    # misalign — confirm against the tagging featurizer, which branches on
    # sep_token_extra.
    special_tokens_count = 2  # CLS, SEP
    pad_token = tokenizer.pad_token
    (unpadded_label_tokens,) = truncate_sequences(
        tokens_ls=[self.label_tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    if feat_spec.cls_token_at_end:
        # Layout: tokens ... [SEP] [CLS] -> two trailing special positions.
        unpadded_label_tokens = unpadded_label_tokens + [pad_token, pad_token]
    else:
        # Layout: [CLS] tokens ... [SEP] -> one special position each side.
        unpadded_label_tokens = [pad_token] + unpadded_label_tokens + [pad_token]

    unpadded_label_token_ids = tokenizer.convert_tokens_to_ids(unpadded_label_tokens)
    masked_lm_labels = np.array(
        pad_single_with_feat_spec(
            ls=unpadded_label_token_ids,
            feat_spec=feat_spec,
            pad_idx=feat_spec.pad_token_id,
        )
    )
    # Exclude pad positions (special tokens + right padding) from the LM
    # loss. NOTE(review): a genuine pad-id token occurring inside the text
    # would also be remapped here.
    pad_positions = masked_lm_labels == feat_spec.pad_token_id
    masked_lm_labels[pad_positions] = mlm_template.NON_MASKED_TOKEN_LABEL_ID

    return DataRow(
        guid=self.guid,
        masked_input_ids=np.array(masked_input_set.input_ids),
        input_mask=np.array(masked_input_set.input_mask),
        segment_ids=np.array(masked_input_set.segment_ids),
        masked_lm_labels=masked_lm_labels,
        masked_tokens=unpadded_masked_inputs.unpadded_tokens,
        label_tokens=unpadded_label_tokens,
    )
def featurize(self, tokenizer, feat_spec):
    """Convert this example into a padded DataRow.

    Delegates tokenization/padding entirely to the shared single-input
    helpers and forwards the example's ``is_english`` flag unchanged.
    """
    unpadded = construct_single_input_tokens_and_segment_ids(
        input_tokens=self.text_tokens,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    padded = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded.unpadded_tokens,
        unpadded_segment_ids=unpadded.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    return DataRow(
        guid=self.guid,
        input_ids=np.array(padded.input_ids),
        input_mask=np.array(padded.input_mask),
        segment_ids=np.array(padded.segment_ids),
        is_english=self.is_english,
        tokens=unpadded.unpadded_tokens,
    )
def featurize(self, tokenizer, feat_spec):
    """Convert this example into a padded DataRow of plain input ids.

    No labels are produced here: masking is applied on the fly during
    training, so only ids / mask / segments and the unpadded tokens are
    carried through.
    """
    unpadded = construct_single_input_tokens_and_segment_ids(
        input_tokens=self.input_tokens,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    padded = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded.unpadded_tokens,
        unpadded_segment_ids=unpadded.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    return DataRow(
        guid=self.guid,
        input_ids=np.array(padded.input_ids),
        input_mask=np.array(padded.input_mask),
        segment_ids=np.array(padded.segment_ids),
        # Masking is performed on the fly in train.
        tokens=unpadded.unpadded_tokens,
    )