Code Example #1
    def __init__(
        self,
        labels: T.List[str],
        tokenizer: PreTrainedTokenizer,
        label_map: T.Dict[str, int],
        dset_filename: str,
        content_column: str,
        label_column: T.Optional[str],
    ):
        """.

        labels: list of valid labels (can be strings/ints)
        tokenizer: AutoTokenizer object that can tokenize input text
        label_map: maps labels to ints for machine-readability
        dset_filename: name of the filename (full filepath) of the dataset being loaded
        content_column: column name of the content to be read
        label_column: column name where the labels can be found
        """
        suffix = dset_filename.split(".")[-1]  # type: ignore

        if suffix in CSV_EXTENSIONS:
            doc_reader = lambda b: pd.read_csv(b, dtype=object)
        else:
            raise ValueError(
                f"The file {dset_filename} doesn't have a recognized extension."
            )

        self.labels = labels
        self.label_map = label_map
        self.tokenizer = tokenizer
        df = doc_reader(dset_filename)  # type: ignore
        self.len_dset = len(df)

        # For later, if we need to output predictions
        self.content_series = df[content_column]
        self.encoded_content = self.tokenizer.batch_encode_plus(
            df[content_column],
            max_length=None,
            pad_to_max_length=True,
        )
        if label_column is not None:
            self.encoded_labels: T.Optional[T.List[int]] = [
                self.label_map[label] for label in df[label_column]
            ]
        else:
            self.encoded_labels = None
        self.features = []
        for i in range(len(self.encoded_content["input_ids"])):
            inputs = {
                k: self.encoded_content[k][i]
                for k in self.encoded_content.keys()
            }
            if self.encoded_labels is not None:
                feature = InputFeatures(**inputs, label=self.encoded_labels[i])
            else:
                feature = InputFeatures(**inputs, label=None)
            self.features.append(feature)
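
Only the constructor appears above, so the following usage sketch has to assume names that are not in the snippet: the enclosing class name ClassificationDataset and the CSV file are illustrative, not taken from the source.

# Hedged usage sketch; ClassificationDataset and reviews.csv are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = ClassificationDataset(
    labels=["neg", "pos"],
    tokenizer=tokenizer,
    label_map={"neg": 0, "pos": 1},
    dset_filename="reviews.csv",
    content_column="text",
    label_column="sentiment",
)
print(dataset.len_dset, len(dataset.features))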
Code Example #2
def convert_examples_to_features(examples, tokenizer, max_length, label_list):
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    logging.info('>>> converting {} examples to features'.format(len(examples)))
    for ex_index, example in enumerate(examples):

        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       truncation=True)
        input_ids, token_type_ids, attention_mask = inputs[
            "input_ids"], inputs["token_type_ids"], inputs["attention_mask"]
        # Zero-pad up to max_length (assumes the pad token id is 0, as for BERT)
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + [0] * padding_length
        token_type_ids = token_type_ids + [0] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        assert len(input_ids) == max_length
        label = label_map[example.label]
        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            ))
        if ex_index < 5:
            logging.info(">>> writing example %d" % (ex_index))
            logging.info('>>> text is {} '.format(example.text_a))
            logging.info('>>> input_ids is {}'.format(input_ids))
            logging.info('>>> label text is {} and label_ids is {}'.format(
                example.label, label))
    return features
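
A minimal sketch of how this converter might be called, assuming the transformers package and its InputExample class; the checkpoint and label list are illustrative.

import logging
from transformers import BertTokenizer, InputExample

logging.basicConfig(level=logging.INFO)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
examples = [
    InputExample(guid="0", text_a="A man is eating.", text_b="Someone eats.",
                 label="entailment"),
]
feats = convert_examples_to_features(
    examples, tokenizer, max_length=64,
    label_list=["entailment", "neutral", "contradiction"])
print(feats[0].input_ids[:10])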
Code Example #3
    def __init__(self, input_ids, attention_masks, labels):
        assert len(input_ids) == len(attention_masks) == len(labels)
        self.features = []
        for index in range(len(labels)):
            feature = InputFeatures(input_ids=input_ids[index],
                                    attention_mask=attention_masks[index],
                                    label=labels[index])
            self.features.append(feature)
Code Example #4
File: bert_unsup_embedding.py  Project: xinyangz/ltrn
  def _text_to_features(self, texts: List[str]):
    batch_encoding = self.tokenizer.batch_encode_plus(
        texts, max_length=self.max_length, pad_to_max_length=True)

    features = []
    for i in range(len(texts)):
      inputs = {k: batch_encoding[k][i] for k in batch_encoding}
      feature = InputFeatures(**inputs)
      features.append(feature)
    return features
Code Example #5
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = []  # -> will hold InputFeatures to be converted later

    for e in examples:
        # The documentation for encode_plus is thorough; see it for the details
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True,  # pads to the right by default
            truncation=True)

        input_ids, token_type_ids, attention_mask = (
            input_dict["input_ids"], input_dict["token_type_ids"],
            input_dict['attention_mask'])

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=e.label))

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32,
            "token_type_ids": tf.int32
        }, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
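
A minimal sketch of feeding this converter, assuming the transformers and tensorflow packages are installed; the checkpoint and examples are illustrative.

from transformers import BertTokenizer, InputExample

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
examples = [
    InputExample(guid="0", text_a="great movie", label=1),
    InputExample(guid="1", text_a="terrible movie", label=0),
]
dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=128)
for inputs, label in dataset.batch(2).take(1):
    print(inputs["input_ids"].shape, label)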
Code Example #6
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float]:
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
Code Example #7
def examples2features(examples, tokenizer, label_list, max_length=128):
    label_map = {label: i for i, label in enumerate(label_list)}

    logger.info("Converting examples to features")
    features = []
    for ex_index, example in enumerate(tqdm(examples)):
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )

        # Unpack the fields we need from the encoding dict
        (input_ids, token_type_ids) = itemgetter("input_ids",
                                                 "token_type_ids")(inputs)
        attention_mask = [1] * len(input_ids)

        # Pad everything
        pad_token = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
        padding_length = max_length - len(input_ids)

        input_ids = input_ids + ([pad_token] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        # Assert that everything was padded correctly
        assert len(input_ids) == max_length
        assert len(token_type_ids) == max_length
        assert len(attention_mask) == max_length

        features.append(
            InputFeatures(
                input_ids,
                attention_mask,
                token_type_ids,
                label=label_map[example.label],
            ))

    # Log some examples to check
    for example, feature in islice(zip(examples, features), 5):
        logger.info("******** Example ********")
        logger.info(f"Guid: {example.guid}")
        logger.info(f"Sentence A: {example.text_a}")
        logger.info(f"Sentence B: {example.text_b}")
        logger.info(f"input_ids: {feature.input_ids}")
        logger.info(f"attention_mask: {feature.attention_mask}")
        logger.info(f"token_type_ids: {feature.token_type_ids}")
        logger.info(f"label: {example.label} (id = {feature.label})")

    return features
Code Example #8
File: data_utils.py  Project: queqinyu/EACL
def classification_convert_example_to_feature(example,
                                              max_length=512,
                                              label_map=None,
                                              pad_on_left=False,
                                              pad_token=0,
                                              pad_token_segment_id=0,
                                              mask_padding_with_zero=True,
                                              set_type='train'):
    # NOTE: relies on a module-level `tokenizer`; it is not passed as a parameter
    inputs = tokenizer.encode_plus(example.text_a,
                                   example.text_b,
                                   add_special_tokens=True,
                                   max_length=max_length,
                                   return_token_type_ids=True)
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        attention_mask = ([0 if mask_padding_with_zero else 1] *
                          padding_length) + attention_mask
        token_type_ids = ([pad_token_segment_id] *
                          padding_length) + token_type_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                           padding_length)

    assert len(
        input_ids) == max_length, "Error with input length {} vs {}".format(
            len(input_ids), max_length)
    assert len(attention_mask
               ) == max_length, "Error with input length {} vs {}".format(
                   len(attention_mask), max_length)
    assert len(token_type_ids
               ) == max_length, "Error with input length {} vs {}".format(
                   len(token_type_ids), max_length)
    if set_type != 'test':
        label = label_map[example.label]
    else:
        label = None

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)
Code Example #9
File: predict_utils.py  Project: queqinyu/EACL
def create_features(examples, tokenizer, max_len):
    features = []
    pad_on_left = False
    pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    pad_token_segment_id = 0
    mask_padding_with_zero = True
    for example in tqdm(examples, desc='convert examples to features'):
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_len,
                                       return_token_type_ids=True)
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        # Zero-pad up to the sequence length.
        padding_length = max_len - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] *
                              padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                               padding_length)
        assert len(
            input_ids) == max_len, "Error with input length {} vs {}".format(
                len(input_ids), max_len)
        assert len(attention_mask
                   ) == max_len, "Error with input length {} vs {}".format(
                       len(attention_mask), max_len)
        assert len(token_type_ids
                   ) == max_len, "Error with input length {} vs {}".format(
                       len(token_type_ids), max_len)
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=None))
    return features
Code Example #10
def convert_examples_to_features(examples, tokenizer,
                                 max_length=512,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    features = []
    for ex_index, example in tqdm(enumerate(examples), total=len(examples)):
        inputs = tokenizer.encode_plus(
            example[COMPLAINT_TEXT],
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask),
                                                                                            max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids),
                                                                                            max_length)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=example[LABEL]))
    return features
Code Example #11
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: AutoTokenizer,
    max_length: Optional[int] = None,
    label_list: Optional[List] = None,
    output_mode="classification",
):
    if max_length is None:
        max_length = tokenizer.max_len
    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
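
Unlike most snippets on this page, this one calls the tokenizer directly with padding="max_length" and truncation=True, which replaces the deprecated pad_to_max_length=True used elsewhere. A hedged equivalence sketch follows; the checkpoint is illustrative, and the old keyword only runs (with a warning) on transformers versions where it is still accepted.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
texts = ["a short sentence", "another one"]

# Current API, as in Code Example #11:
new_enc = tokenizer(texts, max_length=32, padding="max_length", truncation=True)
# Deprecated spelling, as in most other examples here:
old_enc = tokenizer.batch_encode_plus(texts, max_length=32, pad_to_max_length=True)
assert new_enc["input_ids"] == old_enc["input_ids"]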
Code Example #12
def retrieval_examples_to_features(examples, tokenizer, max_length):
    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        feature = InputFeatures(**inputs)
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
Code Example #13
File: STSBenchmark.py  Project: EUFrankie/frankie-ai
    def _convert_examples_to_features(
        self,
        examples,
    ):
        labels = [float(example.label) for example in examples]

        batch_encoding = self.tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) for example in examples],
            max_length=self.max_length,
            pad_to_max_length=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}

            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)

        return features
Code Example #14
    def convert_examples_to_features(
        examples: List[InputExample],
        tokenizer: PreTrainedTokenizer,
        max_length: Optional[int] = None,
        task=None,
        label_list=None,
        output_mode=None,
    ):
        if max_length is None:
            max_length = tokenizer.max_len

        processor = TwitterProcessor()
        label_list = processor.get_labels()

        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(
                example: InputExample) -> Union[int, float, None]:
            return label_map[example.label]

        labels = [label_from_example(example) for example in examples]

        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            pad_to_max_length=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}

            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)

        for i, example in enumerate(examples[:5]):
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("features: %s" % features[i])

        return features
Code Example #15
File: some.py  Project: kokeman/SOME
    def convert_examples_to_features(
        self,
        examples,
        tokenizer,
        max_length=None,
        task=None,
        label_list=None,
        output_mode=None,
    ):
        if max_length is None:
            max_length = tokenizer.max_len

        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(example: InputExample):
            if example.label is None:
                return None
            elif output_mode == 'classification':
                return label_map[example.label]
            elif output_mode == 'regression':
                return float(example.label)
            raise KeyError(output_mode)

        labels = [label_from_example(example) for example in examples]

        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            pad_to_max_length=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}

            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)

        return features
Code Example #16
    def process_inputs(self, texts, labels=None, to_dataset=True):
        """
        convert text to tf dataset used as model input (e.g. for training)
        """
        if labels is None:
            labels = repeat(0)  # itertools.repeat: dummy labels when only predicting

        # tokenize
        tokenized = []
        for text, label in zip(texts, labels):
            inputs = (self.tokenizer.encode_plus(text,
                                                 add_special_tokens=True,
                                                 max_length=self.max_length,
                                                 pad_to_max_length=True))

            tokenized.append(
                InputFeatures(input_ids=inputs['input_ids'],
                              attention_mask=inputs['attention_mask'],
                              token_type_ids=inputs['token_type_ids'],
                              label=label))

        if to_dataset:
            tokenized = self.to_dataset(tokenized)
        return tokenized
Code Example #17
    def __convert_examples_to_tf_dataset(self, data, max_length=128):
        """
    Tokenizes each document to a fixed max_length
      and returns a tensorflow dataset.
    Every element of the dataset consists of:
      1. a dict with the tokenized text and the attention mask, used
          to specify which tokens are valid and which ones are used for padding
      2. the tweet label

    This format is known and used by Bert

    :param data: input data. A list of InputExample objects.
    :type data: list
    :param max_length: fixed length of the tokenization
    :type max_length: int, optional
    :return: a tensorflow dataset as described before
    :rtype: tf.data.Dataset
    """

        # A list of InputFeatures of a single tweet. Every feature contains:
        # tweet's tokens, tweet's attention mask, tweet's label.
        # For more info: https://huggingface.co/transformers/main_classes/processors.html#transformers.data.processors.utils.InputFeatures
        features = []

        for sample in data:
            # For every tweet creates a dictionary. This dictionary contains tweet's
            # tokens ('input_ids') and the tweet's attention mask ('attention mask').
            input_dict = self.__tokenizer(
                # The tweet itself. Remember that the sample is an InputExample.
                sample.text_a,
                # Add the special tokens ([CLS] and [SEP])
                add_special_tokens=True,
                # Fixed tweet vector length
                max_length=max_length,
                # Not needed because we are not comparing text_a to text_b,
                # since we don't have a text_b.
                # For more info: https://huggingface.co/transformers/glossary.html#token-type-ids
                return_token_type_ids=False,
                # Specify to return a binary vector of length = max_length. The vector
                # takes 1 when the corresponding token in the tweet representation is
                # valid, 0 if it is a special character used for padding.
                # For more info: https://huggingface.co/transformers/glossary.html#attention-mask
                return_attention_mask=True,
                # Padding added to the right
                padding='max_length',
                # Truncate the tweet if it is longer than max_length tokens
                truncation=True)

            input_ids, attention_mask = (input_dict['input_ids'],
                                         input_dict['attention_mask'])

            # For every tweet it creates an object of type InputFeatures
            # and adds it to the list.
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              label=sample.label))

        # Creating a generator to convert the features list into a tensorflow dataset.
        def gen():
            for f in features:
                yield (
                    {
                        'input_ids': f.input_ids,
                        'attention_mask': f.attention_mask,
                    },
                    f.label,
                )

        # Returns the dataset from the generator.
        # For more info: https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
        return tf.data.Dataset.from_generator(
            gen,
            ({
                'input_ids': tf.int32,
                'attention_mask': tf.int32,
            }, tf.int64),
            (
                {
                    'input_ids': tf.TensorShape([None]),
                    'attention_mask': tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )
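
Since this dataset yields only input_ids and attention_mask (no token_type_ids), it lines up with single-segment models. A hedged training sketch follows; the model and hyperparameters are illustrative assumptions, not part of the source.

import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification

model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)
model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# `dataset` would be the tf.data.Dataset returned by the method above:
# model.fit(dataset.shuffle(1000).batch(32), epochs=2)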
Code Example #18
File: glue_mem.py  Project: puleon/mem_tokens
    def __init__(
        self,
        args: GlueMemDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        mem_size=20,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        self.args = args
        self.processor = glue_processors[args.task_name]()
        self.output_mode = glue_output_modes[args.task_name]
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_mem_{}_{}_{}_{}".format(
                mode.value,
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
                args.task_name,
            ),
        )
        label_list = self.processor.get_labels()
        if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
            RobertaTokenizer,
            RobertaTokenizerFast,
            XLMRobertaTokenizer,
            BartTokenizer,
            BartTokenizerFast,
        ):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length - args.mem_size,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                mem_id = tokenizer.added_tokens_encoder['[mem]']
                input_ids = [el.input_ids for el in self.features]
                input_ids = [[el[0]] + args.mem_size*[mem_id] + el[1:] for el in input_ids]
                attention_mask = [el.attention_mask for el in self.features]
                attention_mask = [args.mem_size*[1] + el for el in attention_mask]
                token_type_ids = [el.token_type_ids for el in self.features]
                token_type_ids = [args.mem_size*[0] + el for el in token_type_ids]
                labels = [el.label for el in self.features]
                self.features = [InputFeatures(input_ids=el[0], attention_mask=el[1], token_type_ids=el[2], label=el[3])
                                 for el in zip(input_ids, attention_mask, token_type_ids, labels)]
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
Code Example #19
def convert_examples_to_features(
    examples,
    tokenizer,
    processor,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Adapted from glue_convert_examples_to_features from transformers
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples``.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = (
            inputs["input_ids"],
            inputs["token_type_ids"],
        )

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] *
                              padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                               padding_length)

        assert (len(input_ids) == max_length
                ), "Error with input length {} vs {}".format(
                    len(input_ids), max_length)
        assert (len(attention_mask) == max_length
                ), "Error with input length {} vs {}".format(
                    len(attention_mask), max_length)
        assert (len(token_type_ids) == max_length
                ), "Error with input length {} vs {}".format(
                    len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            ))

    return features
Code Example #20
File: cite.py  Project: maclaughlin/CDA
def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    logger.info("I am using the right script!")
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = 0
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
            len_examples = tf.data.experimental.cardinality(examples)
        else:
            len_examples = len(examples)
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d" % (ex_index, len_examples))
        input_ids_1, attention_mask_1, token_type_ids_1 = encode_text(
            tokenizer, pad_token, example.text_a, [], [], [])
        input_ids_2, attention_mask_2, token_type_ids_2 = encode_text(
            tokenizer, pad_token, example.text_b, [], [], [])
        input_ids = (input_ids_1, input_ids_2)
        attention_mask = (attention_mask_1, attention_mask_2)
        token_type_ids = (token_type_ids_1, token_type_ids_2)
        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("input_len: %s" %
                        " ".join([str(len(x)) for x in input_ids]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
Code Example #21
File: utils_nsmc.py  Project: kkimlee/NLU-with-STT
def convert_examples_to_features(
        examples: List[InputExample],
        max_seq_len: int,
        tokenizer: PreTrainedTokenizer,
        cls_token_segment_id=0,
        pad_token_segment_id=0,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures`"""
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens = tokenizer.tokenize(example.text_a)

        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]

        # Add [SEP] token
        tokens += [sep_token]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token
        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                           padding_length)

        assert len(input_ids
                   ) == max_seq_len, "Error with input length {} vs {}".format(
                       len(input_ids), max_seq_len)
        assert len(
            attention_mask
        ) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len)
        assert len(
            token_type_ids
        ) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len)

        label_id = example.label

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label_id))

    return features
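
The manual [CLS]/[SEP] bookkeeping above mirrors what encode_plus does in one call. A hedged equivalence check, with the checkpoint as an illustrative assumption:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "A short example."

# Manual route, as in the loop above (no truncation or padding here)
tokens = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]
manual_ids = tokenizer.convert_tokens_to_ids(tokens)

# One-call route
auto_ids = tokenizer.encode_plus(text, add_special_tokens=True)["input_ids"]
assert manual_ids == auto_ids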
Code Example #22
File: glue.py  Project: sinhlt58/submit-zalo
def glue_convert_examples_to_features(examples,
                                      tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] *
                              padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                               padding_length)

        assert len(input_ids
                   ) == max_length, "Error with input length {} vs {}".format(
                       len(input_ids), max_length)
        assert len(attention_mask
                   ) == max_length, "Error with input length {} vs {}".format(
                       len(attention_mask), max_length)
        assert len(token_type_ids
                   ) == max_length, "Error with input length {} vs {}".format(
                       len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield ({
                    'input_ids': ex.input_ids,
                    'attention_mask': ex.attention_mask,
                    'token_type_ids': ex.token_type_ids
                }, ex.label)

        return tf.data.Dataset.from_generator(gen, ({
            'input_ids': tf.int32,
            'attention_mask': tf.int32,
            'token_type_ids': tf.int32
        }, tf.int64), ({
            'input_ids': tf.TensorShape([None]),
            'attention_mask': tf.TensorShape([None]),
            'token_type_ids': tf.TensorShape([None])
        }, tf.TensorShape([])))

    return features
Code Example #23
    def _convert_examples_to_features(self,
                                      mode: str,
                                      tokenizer: PreTrainedTokenizer,
                                      return_dataset: str = "tf"):
        features = []

        for (ex_index, example) in enumerate(self.examples[mode]):
            if ex_index % 10000 == 0:
                logger.info("Tokenizing example %d", ex_index)

            # This can now be done in one batch (see transformers)
            # and will be sped up even further in the coming months.
            feature = tokenizer.encode_plus(example.text_a,
                                            example.text_b,
                                            add_special_tokens=True,
                                            max_length=self.max_seq_length,
                                            pad_to_max_length=True)
            label = self.labels.index(example.label)

            assert len(feature["input_ids"]) == self.max_seq_length
            assert len(feature["attention_mask"]) == self.max_seq_length
            assert len(feature["token_type_ids"]) == self.max_seq_length

            if ex_index < 5:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in feature["input_ids"]]))
                logger.info(
                    "attention_mask: %s" %
                    " ".join([str(x) for x in feature["attention_mask"]]))
                logger.info(
                    "token_type_ids: %s" %
                    " ".join([str(x) for x in feature["token_type_ids"]]))
                logger.info("label: %s (id = %d)" % (example.label, label))

            features.append(
                InputFeatures(input_ids=feature["input_ids"],
                              attention_mask=feature["attention_mask"],
                              token_type_ids=feature["token_type_ids"],
                              label=label))
        if len(features) == 0:
            return None

        if return_dataset == "tf":
            if not is_tf_available():
                raise RuntimeError(
                    "return_dataset set to 'tf' but TensorFlow 2.0 can't be imported"
                )

            import tensorflow as tf

            def gen():
                for ex in features:
                    yield ({
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids
                    }, ex.label)

            dataset = tf.data.Dataset.from_generator(
                gen,
                ({
                    "input_ids": tf.int32,
                    "attention_mask": tf.int32,
                    "token_type_ids": tf.int32
                }, tf.int64),
                ({
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None])
                }, tf.TensorShape([])),
            )

            return dataset
        elif return_dataset == "pt":
            if not is_torch_available():
                raise RuntimeError(
                    "return_dataset set to 'pt' but PyTorch can't be imported")

            import torch
            from torch.utils.data import TensorDataset

            all_input_ids = torch.tensor([f.input_ids for f in features],
                                         dtype=torch.long)
            all_attention_mask = torch.tensor(
                [f.attention_mask for f in features], dtype=torch.long)
            token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                          dtype=torch.long)
            all_labels = torch.tensor([f.label for f in features],
                                      dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_attention_mask,
                                    token_type_ids, all_labels)

            return dataset
        else:
            raise ValueError("return_dataset should be one of 'tf' or 'pt'")
Code Example #24
    def _convert_examples_to_features(self,
                                      mode: str,
                                      tokenizer: PreTrainedTokenizer,
                                      return_dataset: str = "tf"):
        features = []

        for (ex_index, example) in enumerate(self.examples[mode]):
            if ex_index % 10000 == 0:
                logger.info("Tokenizing example %d", ex_index)

            tokens = []
            label_ids = []
            for word, label in zip(example.text_a, example.label):
                word_tokens = tokenizer.tokenize(word)

                if len(word_tokens) > 0:
                    tokens.extend(word_tokens)
                    label_ids.extend([self.labels.index(label)] + [-1] *
                                     (len(word_tokens) - 1))

            special_tokens_count = tokenizer.num_special_tokens_to_add()

            if len(tokens) > self.max_seq_length - special_tokens_count:
                tokens = tokens[:(self.max_seq_length - special_tokens_count)]
                label_ids = label_ids[:(self.max_seq_length -
                                        special_tokens_count)]

            tokens += [tokenizer.sep_token]
            label_ids += [-1]
            segment_ids = [0] * len(tokens)
            tokens = [tokenizer.cls_token] + tokens
            label_ids = [-1] + label_ids
            segment_ids = [0] + segment_ids

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            padding_length = self.max_seq_length - len(input_ids)
            input_ids += [tokenizer.pad_token_id] * padding_length
            input_mask += [0] * padding_length
            segment_ids += [tokenizer.pad_token_type_id] * padding_length
            label_ids += [-1] * padding_length

            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length
            assert len(label_ids) == self.max_seq_length

            if ex_index < 5:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("attention_mask: %s" %
                            " ".join([str(x) for x in input_mask]))
                logger.info("token_type_ids: %s" %
                            " ".join([str(x) for x in segment_ids]))
                logger.info("label: %s " % (label_ids))

            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=input_mask,
                              token_type_ids=segment_ids,
                              label=label_ids))
        if len(features) == 0:
            return None

        if return_dataset == "tf":
            if not is_tf_available():
                raise RuntimeError(
                    "return_dataset set to 'tf' but TensorFlow 2.0 can't be imported"
                )

            import tensorflow as tf

            def gen():
                for ex in features:
                    yield ({
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids
                    }, ex.label)

            dataset = tf.data.Dataset.from_generator(
                gen,
                ({
                    "input_ids": tf.int32,
                    "attention_mask": tf.int32,
                    "token_type_ids": tf.int32
                }, tf.int64),
                ({
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None])
                }, tf.TensorShape([None])),
            )

            return dataset
        elif return_dataset == "pt":
            if not is_torch_available():
                raise RuntimeError(
                    "return_dataset set to 'pt' but PyTorch can't be imported")

            import torch
            from torch.utils.data import TensorDataset

            all_input_ids = torch.tensor([f.input_ids for f in features],
                                         dtype=torch.long)
            all_attention_mask = torch.tensor(
                [f.attention_mask for f in features], dtype=torch.long)
            token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                          dtype=torch.long)
            all_labels = torch.tensor([f.label for f in features],
                                      dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_attention_mask,
                                    token_type_ids, all_labels)

            return dataset
        else:
            raise ValueError("return_dataset should be one of 'tf' or 'pt'")
Code Example #25
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]
    batch_encoding = {}
    batch_encoding["input_ids"] = []
    batch_encoding["attention_mask"] = []
    batch_encoding["token_type_ids"] = []

    # batch_encoding = tokenizer(
    #     [(example.text_a, example.text_b) for example in examples],
    #     max_length=max_length,
    #     padding="max_length",
    #     truncation=True,
    # )
    # print(batch_encoding)
    # Manual encoding with hard-coded BERT token ids: 101 = [CLS], 102 = [SEP].
    # NOTE: pads with 103, BERT's [MASK] id, rather than the [PAD] id (0).
    def _encode(x, max_length, doc=False):
        input_ids = tokenizer.encode(x,
                                     add_special_tokens=False,
                                     max_length=max_length)
        padding_length = max_length - len(input_ids) - 2
        attention_mask = [1] * len(input_ids) + [0] * padding_length
        input_ids = input_ids + [103] * padding_length
        # if not doc:
        #     input_ids = [101] + input_ids + [102]
        #     attention_mask = [1] + attention_mask + [1]
        # else:
        #     input_ids = input_ids + [102]
        #     attention_mask = attention_mask + [102]
        return input_ids, attention_mask

    for example in examples:
        x, y = example.text_a, example.text_b

        ids1, mask1 = _encode(x, max_length=20)
        ids1 = [101] + ids1 + [102]
        mask1 = [1] + mask1 + [1]
        tids1 = [0] * len(ids1)

        ids2, mask2 = _encode(y, max_length=489)
        ids2 = ids2 + [102]
        mask2 = mask2 + [1]
        tids2 = [1] * len(ids2)

        input_ids = ids1 + ids2
        attention_mask = mask1 + mask2
        token_type_ids = tids1 + tids2

        batch_encoding["input_ids"].append(input_ids)
        batch_encoding["attention_mask"].append(attention_mask)
        batch_encoding["token_type_ids"].append(token_type_ids)

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logging.info("*** Example ***")
        logging.info("guid: %s" % (example.guid))
        logging.info("features: %s" % features[i])

    return features
Code Example #26
              # if ex_index > 100:
              #   break
              # if len(full_length_encoded) < 5000:
              #   continue

              input_ids_list = []
              attention_mask_list = []
              token_type_ids_list = []

              input_ids_list = full_length_encoded

              feature_v.append(
                  InputFeatures(
                      input_ids=input_ids_list, attention_mask=attention_mask_list, token_type_ids=token_type_ids_list, label=1 if row.scum else 0
                  )
              )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    from torch.utils.data import Dataset
    class MyDS(Dataset):
      def __init__(self, features):
        self.features=features
      def __getitem__(self, idx):
        return self.features[idx]
      def __len__(self):
        return len(self.features)