Example 1
    def _convert_examples_to_features(
        self,
        examples: List[TokenClsInputExample],
        max_seq_length,
        tokenizer,
        include_labels=True,
        cls_token_at_end=False,
        pad_on_left=False,
        cls_token="[CLS]",
        sep_token="[SEP]",
        pad_token=0,
        sequence_segment_id=0,
        sep_token_extra=0,
        cls_token_segment_id=1,
        pad_token_segment_id=0,
        mask_padding_with_zero=True,
    ):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token
            (0 for BERT, 2 for XLNet)
        """

        if include_labels:
            label_map = {v: k for k, v in self.labels_id_map.items()}
            label_pad = 0

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Processing example %d of %d", ex_index,
                            len(examples))

            tokens = []
            labels = []
            valid_tokens = []
            for i, token in enumerate(example.tokens):
                new_tokens = tokenizer.tokenize(token)
                tokens.extend(new_tokens)
                v_tok = [0] * (len(new_tokens))
                v_tok[0] = 1
                valid_tokens.extend(v_tok)
                if include_labels:
                    v_lbl = [label_pad] * (len(new_tokens))
                    v_lbl[0] = label_map.get(example.label[i])
                    labels.extend(v_lbl)

            # truncate by max_seq_length
            special_tokens_count = 3 if sep_token_extra else 2
            tokens = tokens[:(max_seq_length - special_tokens_count)]
            valid_tokens = valid_tokens[:(max_seq_length -
                                          special_tokens_count)]
            if include_labels:
                labels = labels[:(max_seq_length - special_tokens_count)]

            tokens += [sep_token]
            if include_labels:
                labels += [label_pad]
            valid_tokens += [0]
            if sep_token_extra:  # roberta special case
                tokens += [sep_token]
                valid_tokens += [0]
                if include_labels:
                    labels += [label_pad]
            segment_ids = [sequence_segment_id] * len(tokens)

            if cls_token_at_end:
                tokens = tokens + [cls_token]
                segment_ids = segment_ids + [cls_token_segment_id]
                if include_labels:
                    labels = labels + [label_pad]
                valid_tokens = valid_tokens + [0]
            else:
                tokens = [cls_token] + tokens
                segment_ids = [cls_token_segment_id] + segment_ids
                if include_labels:
                    labels = [label_pad] + labels
                valid_tokens = [0] + valid_tokens

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_length) + segment_ids
                if include_labels:
                    labels = ([label_pad] * padding_length) + labels
                valid_tokens = ([0] * padding_length) + valid_tokens
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                input_mask = input_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)
                segment_ids = segment_ids + ([pad_token_segment_id] *
                                             padding_length)
                if include_labels:
                    labels = labels + ([label_pad] * padding_length)
                valid_tokens = valid_tokens + ([0] * padding_length)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(valid_tokens) == max_seq_length
            if include_labels:
                assert len(labels) == max_seq_length

            features.append(
                InputFeatures(
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    label_id=labels,
                    valid_ids=valid_tokens,
                ))
        return features
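
Below is a minimal, self-contained sketch of the `valid_ids` alignment idea used in Example 1: only the first sub-token of each word is marked valid and carries the word's label, while continuation sub-tokens get the padding label. The toy 3-character tokenizer is a hypothetical stand-in for a real wordpiece tokenizer, used here only to make the sketch runnable.

    def toy_tokenize(word):
        # Hypothetical stand-in: split every word into 3-character pieces,
        # "##"-prefixed after the first, mimicking wordpiece output.
        pieces = [word[i:i + 3] for i in range(0, len(word), 3)]
        return [pieces[0]] + ["##" + p for p in pieces[1:]]

    words = ["jacksonville", "is", "nice"]
    word_labels = [1, 0, 0]  # hypothetical per-word label ids

    tokens, valid_tokens, labels = [], [], []
    for word, label in zip(words, word_labels):
        pieces = toy_tokenize(word)
        tokens.extend(pieces)
        valid_tokens.extend([1] + [0] * (len(pieces) - 1))
        labels.extend([label] + [0] * (len(pieces) - 1))  # 0 acts as label_pad

    print(tokens)        # ['jac', '##kso', '##nvi', '##lle', 'is', 'nic', '##e']
    print(valid_tokens)  # [1, 0, 0, 0, 1, 1, 0]
    print(labels)        # [1, 0, 0, 0, 0, 0, 0]
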
Example 2
    def _convert_examples_to_features(self,
                                      examples,
                                      max_seq_length,
                                      tokenizer,
                                      task_type,
                                      include_labels=True,
                                      cls_token_at_end=False,
                                      pad_on_left=False,
                                      cls_token='[CLS]',
                                      sep_token='[SEP]',
                                      pad_token=0,
                                      sequence_a_segment_id=0,
                                      sequence_b_segment_id=1,
                                      cls_token_segment_id=1,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token
            (0 for BERT, 2 for XLNet)
        """

        if include_labels:
            label_map = {label: i for i, label in enumerate(self.labels)}

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d" %
                            (ex_index, len(examples)))

            tokens_a = tokenizer.tokenize(example.text)

            tokens_b = None
            if example.text_b:
                tokens_b = tokenizer.tokenize(example.text_b)
                # Modifies `tokens_a` and `tokens_b` in place so that the total
                # length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > max_seq_length - 2:
                    tokens_a = tokens_a[:(max_seq_length - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = tokens_a + [sep_token]
            segment_ids = [sequence_a_segment_id] * len(tokens)

            if tokens_b:
                tokens += tokens_b + [sep_token]
                segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

            if cls_token_at_end:
                tokens = tokens + [cls_token]
                segment_ids = segment_ids + [cls_token_segment_id]
            else:
                tokens = [cls_token] + tokens
                segment_ids = [cls_token_segment_id] + segment_ids

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_length) + segment_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                input_mask = input_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)
                segment_ids = segment_ids + ([pad_token_segment_id] *
                                             padding_length)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            if include_labels:
                if task_type == "classification":
                    label_id = label_map[example.label]
                elif task_type == "regression":
                    label_id = float(example.label)
                else:
                    raise KeyError(task_type)
            else:
                label_id = None

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
        return features
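
Example 2 calls `_truncate_seq_pair`, which is not shown in this excerpt. As a reference, here is a sketch of what that helper conventionally looks like in the BERT reference code (the exact helper used here may differ): it pops tokens one at a time from whichever sequence is currently longer, so the pair is trimmed as evenly as possible.

    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to a total length <= max_length."""
        while len(tokens_a) + len(tokens_b) > max_length:
            # Remove from the longer sequence so both sides shrink evenly.
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
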
Example 3
    def _convert_examples_to_features(
        self,
        examples,
        max_seq_length,
        tokenizer,
        task_type,
        include_labels=True,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True,
    ):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token
            (0 for BERT, 2 for XLNet)
        """

        if include_labels:
            label_map = {label: i for i, label in enumerate(self.labels)}

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d", ex_index,
                            len(examples))

            inputs = tokenizer.encode_plus(
                example.text,
                example.text_b,
                add_special_tokens=True,
                max_length=max_seq_length,
            )
            input_ids = inputs["input_ids"]
            token_type_ids = inputs["token_type_ids"]

            attention_mask = [1 if mask_padding_with_zero else 0
                              ] * len(input_ids)

            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] *
                                  padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] *
                                  padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                                   padding_length)

            assert len(input_ids) == max_seq_length
            assert len(attention_mask) == max_seq_length
            assert len(token_type_ids) == max_seq_length

            if include_labels:
                if task_type == "classification":
                    label_id = label_map[example.label]
                elif task_type == "regression":
                    label_id = float(example.label)
                else:
                    raise KeyError(task_type)
            else:
                label_id = None

            features.append(
                InputFeatures(
                    input_ids=input_ids,
                    input_mask=attention_mask,
                    segment_ids=token_type_ids,
                    label_id=label_id,
                ))
        return features
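
Example 3 delegates special-token insertion and `token_type_ids` construction to `tokenizer.encode_plus`. The snippet below is a hedged usage sketch assuming a Hugging Face `transformers` tokenizer; the model name and the `truncation=True` flag are assumptions, not taken from this code. It shows roughly what the start of the loop produces for a sentence pair.

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    inputs = tokenizer.encode_plus(
        "is this jacksonville ?",
        "no it is not .",
        add_special_tokens=True,  # tokenizer inserts [CLS] and [SEP] itself
        max_length=16,
        truncation=True,          # newer tokenizer versions need this to truncate
    )
    print(inputs["input_ids"])       # ids for: [CLS] A [SEP] B [SEP]
    print(inputs["token_type_ids"])  # 0s over segment A, 1s over segment B
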
Example 4
    def _convert_examples_to_features(self,
                                      examples: List[TokenClsInputExample],
                                      max_seq_length,
                                      tokenizer,
                                      include_labels=True,
                                      cls_token_at_end=False,
                                      pad_on_left=False,
                                      cls_token='[CLS]',
                                      sep_token='[SEP]',
                                      pad_token=0,
                                      sequence_segment_id=0,
                                      cls_token_segment_id=1,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token
            (0 for BERT, 2 for XLNet)
        """

        if include_labels:
            label_map = {v: k for k, v in self.labels_id_map.items()}
            label_pad = 0

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Processing example %d of %d" %
                            (ex_index, len(examples)))

            tokens = []
            labels = []
            valid_tokens = []
            for i, token in enumerate(example.tokens):
                new_tokens = tokenizer.tokenize(token)
                tokens.extend(new_tokens)
                v_tok = [0] * (len(new_tokens))
                v_tok[0] = 1
                valid_tokens.extend(v_tok)
                if include_labels:
                    v_lbl = [label_pad] * (len(new_tokens))
                    v_lbl[0] = label_map.get(example.label[i])
                    labels.extend(v_lbl)

            # truncate by max_seq_length
            tokens = tokens[:(max_seq_length - 2)]
            if include_labels:
                labels = labels[:(max_seq_length - 2)]
            valid_tokens = valid_tokens[:(max_seq_length - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = tokens + [sep_token]
            if include_labels:
                labels = labels + [label_pad]
            valid_tokens = valid_tokens + [0]
            segment_ids = [sequence_segment_id] * len(tokens)

            if cls_token_at_end:
                tokens = tokens + [cls_token]
                segment_ids = segment_ids + [cls_token_segment_id]
                if include_labels:
                    labels = labels + [label_pad]
                valid_tokens = valid_tokens + [0]
            else:
                tokens = [cls_token] + tokens
                segment_ids = [cls_token_segment_id] + segment_ids
                if include_labels:
                    labels = [label_pad] + labels
                valid_tokens = [0] + valid_tokens

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_length) + segment_ids
                if include_labels:
                    labels = ([label_pad] * padding_length) + labels
                valid_tokens = ([0] * padding_length) + valid_tokens
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                input_mask = input_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)
                segment_ids = segment_ids + ([pad_token_segment_id] *
                                             padding_length)
                if include_labels:
                    labels = labels + ([label_pad] * padding_length)
                valid_tokens = valid_tokens + ([0] * padding_length)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(valid_tokens) == max_seq_length
            if include_labels:
                assert len(labels) == max_seq_length

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=labels,
                              valid_ids=valid_tokens))
        return features
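
Downstream, lists of `InputFeatures` like the ones built above are typically stacked into tensors for batching. The sketch below assumes PyTorch as the backend and that `InputFeatures` exposes exactly the attributes populated in Example 4; it is illustrative rather than part of this codebase.

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    def features_to_dataset(features):
        # Stack each per-example list (all padded to max_seq_length) into a tensor.
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long)
        return TensorDataset(input_ids, input_mask, segment_ids, label_ids, valid_ids)

    # Usage (hypothetical):
    # loader = DataLoader(features_to_dataset(features), batch_size=8, shuffle=True)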