Example #1
    def featurize(self, tokenizer, feat_spec):
        unpadded_inputs = construct_single_input_tokens_and_segment_ids(
            input_tokens=self.text,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        # Replicate padding / additional tokens for the label ids and mask
        if feat_spec.sep_token_extra:
            label_suffix = [None, None]
            mask_suffix = [0, 0]
            special_tokens_count = 3  # CLS, SEP, SEP
        else:
            label_suffix = [None]
            mask_suffix = [0]
            special_tokens_count = 2  # CLS, SEP
        unpadded_labels = (
            [None] +
            self.labels[:feat_spec.max_seq_length - special_tokens_count] +
            label_suffix)
        unpadded_labels = [i if i is not None else -1 for i in unpadded_labels]
        unpadded_label_mask = (
            [0] +
            self.label_mask[:feat_spec.max_seq_length - special_tokens_count] +
            mask_suffix)

        padded_labels = pad_single_with_feat_spec(
            ls=unpadded_labels,
            feat_spec=feat_spec,
            pad_idx=-1,
        )
        padded_label_mask = pad_single_with_feat_spec(
            ls=unpadded_label_mask,
            feat_spec=feat_spec,
            pad_idx=0,
        )

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            label_ids=np.array(padded_labels),
            label_mask=np.array(padded_label_mask),
            tokens=unpadded_inputs.unpadded_tokens,
        )
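
The label alignment above can be isolated into a small, self-contained sketch. The helper below is illustrative only (align_labels is not part of the codebase); it assumes a BERT-style layout where a CLS position is prepended, special-token positions receive the ignore index -1, and labels are truncated to fit max_seq_length:

    def align_labels(labels, max_seq_length, sep_token_extra=False):
        # Hypothetical helper replicating the logic above, for illustration only
        if sep_token_extra:
            label_suffix = [None, None]
            special_tokens_count = 3  # CLS, SEP, SEP
        else:
            label_suffix = [None]
            special_tokens_count = 2  # CLS, SEP
        unpadded = [None] + labels[:max_seq_length - special_tokens_count] + label_suffix
        # Special-token positions carry no label; -1 marks them as ignored
        return [i if i is not None else -1 for i in unpadded]

    print(align_labels([5, 6, 7], max_seq_length=8))  # [-1, 5, 6, 7, -1]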
Example #2
    def featurize(self, tokenizer, feat_spec):
        # Handle masked_tokens
        unpadded_masked_inputs = construct_single_input_tokens_and_segment_ids(
            input_tokens=self.masked_tokens,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        masked_input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_masked_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_masked_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        # Handle label_tokens
        special_tokens_count = 2  # CLS, SEP
        pad_token = tokenizer.pad_token
        (unpadded_label_tokens,) = truncate_sequences(
            tokens_ls=[self.label_tokens],
            max_length=feat_spec.max_seq_length - special_tokens_count,
        )
        # Pad the label sequence at the special-token positions so it stays
        # aligned with the input layout (CLS at the end vs. at the front)
        if feat_spec.cls_token_at_end:
            unpadded_label_tokens = unpadded_label_tokens + [pad_token, pad_token]
        else:
            unpadded_label_tokens = [pad_token] + unpadded_label_tokens + [pad_token]
        unpadded_label_token_ids = tokenizer.convert_tokens_to_ids(unpadded_label_tokens)
        masked_lm_labels = pad_single_with_feat_spec(
            ls=unpadded_label_token_ids,
            feat_spec=feat_spec,
            pad_idx=feat_spec.pad_token_id,
        )
        masked_lm_labels = np.array(masked_lm_labels)
        # Positions holding the pad id are excluded from the MLM loss
        masked_lm_labels[masked_lm_labels == feat_spec.pad_token_id] = (
            mlm_template.NON_MASKED_TOKEN_LABEL_ID
        )
        return DataRow(
            guid=self.guid,
            masked_input_ids=np.array(masked_input_set.input_ids),
            input_mask=np.array(masked_input_set.input_mask),
            segment_ids=np.array(masked_input_set.segment_ids),
            masked_lm_labels=masked_lm_labels,
            masked_tokens=unpadded_masked_inputs.unpadded_tokens,
            label_tokens=unpadded_label_tokens,
        )
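
The final step above, overwriting pad-id positions with a sentinel label, is easy to demonstrate in isolation. In this sketch the concrete values are assumptions (NON_MASKED_TOKEN_LABEL_ID is commonly -100 so the loss skips those positions; the actual constant lives in mlm_template):

    import numpy as np

    PAD_TOKEN_ID = 0                  # assumed pad id, for illustration
    NON_MASKED_TOKEN_LABEL_ID = -100  # assumed sentinel ignored by the loss

    labels = np.array([101, 7592, 2088, PAD_TOKEN_ID, PAD_TOKEN_ID])
    labels[labels == PAD_TOKEN_ID] = NON_MASKED_TOKEN_LABEL_ID
    print(labels)  # [ 101 7592 2088 -100 -100]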
Example #3
    def featurize(self, tokenizer, feat_spec):
        unpadded_inputs = construct_single_input_tokens_and_segment_ids(
            input_tokens=self.text_tokens,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            is_english=self.is_english,
            tokens=unpadded_inputs.unpadded_tokens,
        )
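
As a hedged usage sketch, a featurize method like this one is typically driven over a list of examples, with the resulting DataRow arrays stacked into a batch. featurize_batch below is a hypothetical helper, not part of the codebase:

    import numpy as np

    def featurize_batch(examples, tokenizer, feat_spec):
        # Hypothetical helper: featurize each example, then stack the
        # fixed-length arrays into (batch, max_seq_length) matrices
        rows = [example.featurize(tokenizer, feat_spec) for example in examples]
        return {
            "input_ids": np.stack([row.input_ids for row in rows]),
            "input_mask": np.stack([row.input_mask for row in rows]),
            "segment_ids": np.stack([row.segment_ids for row in rows]),
        }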
Example #4
    def featurize(self, tokenizer, feat_spec):
        unpadded_inputs = construct_single_input_tokens_and_segment_ids(
            input_tokens=self.input_tokens,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            # Masking will be performed on the fly in train
            tokens=unpadded_inputs.unpadded_tokens,
        )
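
The comment about masking "on the fly" refers to dynamic MLM masking: rather than storing masked inputs, a fresh random subset of positions is masked each time a batch is drawn. A minimal sketch, assuming BERT-style hyperparameters (15% masking rate, every selected token replaced by the mask id; real implementations often use an 80/10/10 mask/random/keep split and skip special-token positions):

    import numpy as np

    def dynamic_mask(input_ids, mask_token_id, rng, mask_prob=0.15):
        # Mask a fresh random subset of positions; labels keep the original ids
        input_ids = input_ids.copy()
        labels = np.full_like(input_ids, -100)      # -100: position not scored
        chosen = rng.random(input_ids.shape) < mask_prob
        labels[chosen] = input_ids[chosen]          # loss is computed only here
        input_ids[chosen] = mask_token_id           # replace with the mask id
        return input_ids, labels

    rng = np.random.default_rng(0)
    masked, labels = dynamic_mask(np.arange(10, 20), mask_token_id=103, rng=rng)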