def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf``
    (if installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        import torch

        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)
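
A minimal usage sketch: call the helper above once, up front, before any code that draws random numbers. The imports mirror what the helper itself relies on.

import random
import numpy as np

set_seed(42)

# The seeded generators now yield the same streams on every run.
print(random.random())
print(np.random.rand())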
Example #2
import unittest
import sys, os
import pdb

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../src/")
from file_utils import is_tf_available
from testing_utils import require_tf
import json

if is_tf_available():

    import datetime, pickle, codecs, re, string
    from tqdm import tqdm
    import tensorflow as tf
    import pandas as pd
    import numpy as np
    from preprocess.utils import (
        Params,
        get_dataset,
        fix_fn,
        _py_fn,
        load_subword_embedding,
        normalize,
    )
    from models.hierarchical_attention.han import HAN_Model


@require_tf
class TestHANLoader(unittest.TestCase):
    @classmethod
Example #3
def glue_convert_examples_to_features(examples,
                                      tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        A list of task-specific ``InputFeatures`` (including the extra ``align_mask`` fields)
        which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        inputs2 = tokenizer.encode_plus(
            example.text_a,
            example.text_c,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        input_ids2, token_type_ids2 = inputs2["input_ids"], inputs2["token_type_ids"]

        # token_type_ids is 0 over text_a (plus special tokens) and 1 over the
        # second segment, so counting zeros recovers the text_a span length.
        text_a_len = token_type_ids.count(0)
        text_b_len = len(token_type_ids) - text_a_len
        text_c_len = len(token_type_ids2) - text_a_len
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        attention_mask2 = [1 if mask_padding_with_zero else 0] * len(input_ids2)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        padding_length2 = max_length - len(input_ids2)

        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_ids2 = ([pad_token] * padding_length2) + input_ids2

            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            attention_mask2 = ([0 if mask_padding_with_zero else 1] *
                               padding_length2) + attention_mask2

            token_type_ids = ([pad_token_segment_id] *
                              padding_length) + token_type_ids
            token_type_ids2 = ([pad_token_segment_id] *
                               padding_length2) + token_type_ids2
            """
            生成对齐Attention
            p a b
            a
            b
            """
            align_mask =[[0 if mask_padding_with_zero else 1] *len(input_ids)]*padding_length\
                        +[[0 if mask_padding_with_zero else 1]*(padding_length+text_a_len)+[1 if mask_padding_with_zero else 0]*text_b_len]*text_a_len\
                        +[[0 if mask_padding_with_zero else 1]*padding_length+[1 if mask_padding_with_zero else 0]*text_a_len+[0 if mask_padding_with_zero else 1]*text_b_len]*text_b_len
            align_mask2 =[[0 if mask_padding_with_zero else 1] *len(input_ids2)]*padding_length2\
                        +[[0 if mask_padding_with_zero else 1]*(padding_length2+text_a_len)+[1 if mask_padding_with_zero else 0]*text_c_len]*text_a_len\
                        +[[0 if mask_padding_with_zero else 1]*padding_length2+[1 if mask_padding_with_zero else 0]*text_a_len+[0 if mask_padding_with_zero else 1]*text_c_len]*text_c_len
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_ids2 = input_ids2 + ([pad_token] * padding_length2)

            attention_mask = attention_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length)
            attention_mask2 = attention_mask2 + (
                [0 if mask_padding_with_zero else 1] * padding_length2)

            token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                               padding_length)
            token_type_ids2 = token_type_ids2 + ([pad_token_segment_id] *
                                                 padding_length2)

            # Same alignment mask with right padding: column layout
            # [text_a | text_b | pad].
            pad_val = 0 if mask_padding_with_zero else 1
            real_val = 1 if mask_padding_with_zero else 0
            align_mask = (
                [[pad_val] * text_a_len + [real_val] * text_b_len + [pad_val] * padding_length] * text_a_len
                + [[real_val] * text_a_len + [pad_val] * (text_b_len + padding_length)] * text_b_len
                + [[pad_val] * len(input_ids)] * padding_length
            )
            align_mask2 = (
                [[pad_val] * text_a_len + [real_val] * text_c_len + [pad_val] * padding_length2] * text_a_len
                + [[real_val] * text_a_len + [pad_val] * (text_c_len + padding_length2)] * text_c_len
                + [[pad_val] * len(input_ids2)] * padding_length2
            )

        assert len(input_ids) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(align_mask[0]) == max_length, \
            "Error with input length {} vs {}".format(len(align_mask[0]), max_length)
        assert len(token_type_ids) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        assert len(input_ids2) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids2), max_length)
        assert len(attention_mask2) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask2), max_length)
        assert len(align_mask2[0]) == max_length, \
            "Error with input length {} vs {}".format(len(align_mask2[0]), max_length)
        assert len(token_type_ids2) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids2), max_length)

        if example.label is not None:
            if output_mode == "classification":
                label = label_map[example.label]
            elif output_mode == "regression":
                label = float(example.label)
            else:
                raise KeyError(output_mode)
        else:
            label = None

        if example.label2 is not None:
            if output_mode == "classification":
                label2 = label_map[example.label2]
            elif output_mode == "regression":
                label2 = float(example.label2)
            else:
                raise KeyError(output_mode)
        else:
            label2 = None

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in token_type_ids]))
            logger.info("input_ids2: %s" %
                        " ".join([str(x) for x in input_ids2]))
            logger.info("attention_mask2: %s" %
                        " ".join([str(x) for x in attention_mask2]))
            logger.info("token_type_ids2: %s" %
                        " ".join([str(x) for x in token_type_ids2]))
            if label is not None:
                logger.info("label: %s (id = %d)" % (example.label, label))
            if label2 is not None:
                logger.info("label2: %s (id = %d)" % (example.label2, label2))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          align_mask=align_mask,
                          token_type_ids=token_type_ids,
                          label=label,
                          input_ids2=input_ids2,
                          attention_mask2=attention_mask2,
                          align_mask2=align_mask2,
                          token_type_ids2=token_type_ids2,
                          label2=label2))
    return features
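
To sanity-check the block structure these align masks encode, here is a small self-contained sketch with hypothetical toy lengths (assuming mask_padding_with_zero=True and right padding; not part of the original module):

# Toy illustration of the right-padded align_mask layout built above.
text_a_len, text_b_len, padding_length = 3, 2, 2
seq_len = text_a_len + text_b_len + padding_length

pad_val, real_val = 0, 1  # mask_padding_with_zero=True
align_mask = (
    [[pad_val] * text_a_len + [real_val] * text_b_len + [pad_val] * padding_length] * text_a_len
    + [[real_val] * text_a_len + [pad_val] * (text_b_len + padding_length)] * text_b_len
    + [[pad_val] * seq_len] * padding_length
)
for row in align_mask:
    print(row)
# text_a rows -> [0, 0, 0, 1, 1, 0, 0]
# text_b rows -> [1, 1, 1, 0, 0, 0, 0]
# padding rows -> all zeros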
Example #4

def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = 0
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
            len_examples = tf.data.experimental.cardinality(examples)
        else:
            len_examples = len(examples)
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d" % (ex_index, len_examples))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] *
                              padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                               padding_length)

        assert len(input_ids) == max_length, \
            "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, \
            "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, \
            "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
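
A hedged usage sketch for the variant above; the checkpoint and task names are illustrative, and it assumes transformers and tensorflow_datasets are installed alongside the module this function lives in:

# Hypothetical usage with a TFDS-backed GLUE task.
import tensorflow_datasets as tfds
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
data = tfds.load("glue/mrpc")

# Because the input is a tf.data.Dataset, the function returns one too.
train_dataset = glue_convert_examples_to_features(
    data["train"], tokenizer, max_length=128, task="mrpc"
)
train_dataset = train_dataset.shuffle(100).batch(32)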
Example #5
def squad_convert_examples_to_features(
    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1
):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default ``False``. Either ``'pt'`` or ``'tf'``.
            If ``'pt'``: returns a ``torch.utils.data.TensorDataset``;
            if ``'tf'``: returns a ``tf.data.Dataset``.
        threads: multiple processing threads


    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Defining helper methods
    features = []
    threads = min(threads, cpu_count())
    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
        annotate_ = partial(
            squad_convert_example_to_features,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=is_training,
        )
        features = list(
            tqdm(
                p.imap(annotate_, examples, chunksize=32),
                total=len(examples),
                desc="convert squad examples to features",
                mininterval=5,
            )
        )
    new_features = []
    unique_id = 1000000000
    example_index = 0
    for example_features in tqdm(features, total=len(features), desc="add example index and unique id",
                                 mininterval=5):
        if not example_features:
            continue
        for example_feature in example_features:
            example_feature.example_index = example_index
            example_feature.unique_id = unique_id
            new_features.append(example_feature)
            unique_id += 1
        example_index += 1
    features = new_features
    del new_features
    if return_dataset == "pt":
        if not is_torch_available():
            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)

        if not is_training:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
            )
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids,
                all_attention_masks,
                all_token_type_ids,
                all_start_positions,
                all_end_positions,
                all_cls_index,
                all_p_mask,
                all_is_impossible,
            )

        return features, dataset
    elif return_dataset == "tf":
        if not is_tf_available():
            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    {
                        "start_position": ex.start_position,
                        "end_position": ex.end_position,
                        "cls_index": ex.cls_index,
                        "p_mask": ex.p_mask,
                        "is_impossible": ex.is_impossible,
                    },
                )

        return tf.data.Dataset.from_generator(
            gen,
            (
                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
                {
                    "start_position": tf.int64,
                    "end_position": tf.int64,
                    "cls_index": tf.int64,
                    "p_mask": tf.int32,
                    "is_impossible": tf.int32,
                },
            ),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                {
                    "start_position": tf.TensorShape([]),
                    "end_position": tf.TensorShape([]),
                    "cls_index": tf.TensorShape([]),
                    "p_mask": tf.TensorShape([None]),
                    "is_impossible": tf.TensorShape([]),
                },
            ),
        )

    return features
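
A hedged sketch of consuming the 'pt' branch above, continuing the docstring example (tokenizer and examples as defined there; assumes PyTorch is installed):

# Hypothetical follow-up: wrap the returned TensorDataset in a DataLoader.
from torch.utils.data import DataLoader, RandomSampler

features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset="pt",
)
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=8)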
Example #6

    def get_features(
        self,
        tokenizer,
        max_length=None,
        pad_on_left=False,
        pad_token=0,
        mask_padding_with_zero=True,
        return_tensors=None,
    ):
        """
        Convert examples in a list of ``InputFeatures``

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
                actual values)
            return_tensors: If set to ``'tf'`` or ``'pt'``, returns a ``tf.data.Dataset`` or a PyTorch
                ``TensorDataset`` instead of a list of ``InputFeatures``

        Returns:
            A list of ``InputFeatures``, or a framework-specific dataset when ``return_tensors``
            is set to ``'tf'`` or ``'pt'``.

        """
        if max_length is None:
            max_length = tokenizer.max_len

        label_map = {label: i for i, label in enumerate(self.labels)}

        all_input_ids = []
        for (ex_index, example) in enumerate(self.examples):
            if ex_index % 10000 == 0:
                logger.info("Tokenizing example %d", ex_index)

            input_ids = tokenizer.encode(
                example.text_a,
                add_special_tokens=True,
                max_length=min(max_length, tokenizer.max_len),
            )
            all_input_ids.append(input_ids)

        batch_length = max(len(input_ids) for input_ids in all_input_ids)

        features = []
        for ex_index, (input_ids, example) in enumerate(zip(all_input_ids, self.examples)):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d/%d" % (ex_index, len(self.examples)))
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = batch_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] *
                                  padding_length) + attention_mask
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)

            assert len(input_ids) == batch_length, \
                "Error with input length {} vs {}".format(len(input_ids), batch_length)
            assert len(attention_mask) == batch_length, \
                "Error with input length {} vs {}".format(len(attention_mask), batch_length)

            if self.mode == "classification":
                label = label_map[example.label]
            elif self.mode == "regression":
                label = float(example.label)
            else:
                raise ValueError(self.mode)

            if ex_index < 5 and self.verbose:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("attention_mask: %s" %
                            " ".join([str(x) for x in attention_mask]))
                logger.info("label: %s (id = %d)" % (example.label, label))

            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              label=label))

        if return_tensors is None:
            return features
        elif return_tensors == "tf":
            if not is_tf_available():
                raise RuntimeError(
                    "return_tensors set to 'tf' but TensorFlow 2.0 can't be imported"
                )
            import tensorflow as tf

            def gen():
                for ex in features:
                    yield ({
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask
                    }, ex.label)

            dataset = tf.data.Dataset.from_generator(
                gen,
                ({
                    "input_ids": tf.int32,
                    "attention_mask": tf.int32
                }, tf.int64),
                ({
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None])
                }, tf.TensorShape([])),
            )
            return dataset
        elif return_tensors == "pt":
            if not is_torch_available():
                raise RuntimeError(
                    "return_tensors set to 'pt' but PyTorch can't be imported")
            import torch
            from torch.utils.data import TensorDataset

            all_input_ids = torch.tensor([f.input_ids for f in features],
                                         dtype=torch.long)
            all_attention_mask = torch.tensor(
                [f.attention_mask for f in features], dtype=torch.long)
            if self.mode == "classification":
                all_labels = torch.tensor([f.label for f in features],
                                          dtype=torch.long)
            elif self.mode == "regression":
                all_labels = torch.tensor([f.label for f in features],
                                          dtype=torch.float)

            dataset = TensorDataset(all_input_ids, all_attention_mask,
                                    all_labels)
            return dataset
        else:
            raise ValueError("return_tensors should be one of 'tf' or 'pt'")