Example #1
    def build_datasets(
            self) -> Dict[str, Union[datasets.Dataset, datasets.DatasetDict]]:
        tokenized_datasets = {}
        for split in ["train", "validation"]:
            tokenized_datasets[split] = self.raw_datasets[split].map(
                functools.partial(
                    self.data_processors.prepare_features,
                    split,
                    self.data_config,
                    self.tokenizer,
                    self.column_names,
                ),
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                remove_columns=self.column_names,
                load_from_cache_file=not self.data_config.overwrite_cache,
            )
            hf.remove_unused_columns(self.model, tokenized_datasets[split])
        if self.data_config.pad_to_max_length:
            self.collator = transformers.default_data_collator
        else:
            collator = transformers.DataCollatorWithPadding(
                self.tokenizer,
                pad_to_multiple_of=8 if self.hparams.use_apex_amp else None)
            self.collator = lambda x: collator(x).data
        return tokenized_datasets
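A note on the lambda above: transformers.DataCollatorWithPadding returns a BatchEncoding, and .data unwraps it into the plain dict the training loop expects. A minimal standalone sketch of that wrapping; the "bert-base-uncased" tokenizer is an illustrative assumption, not part of the example above.

# Sketch only: "bert-base-uncased" is an assumed, downloadable tokenizer; the
# point is that the collator returns a BatchEncoding whose .data is a dict.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
collator = transformers.DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
features = [
    tokenizer("a short sentence"),
    tokenizer("a noticeably longer sentence that needs extra padding"),
]
batch = collator(features)  # BatchEncoding, padded to a common length
plain = batch.data          # underlying dict, as produced by the lambda above
print(type(batch).__name__, plain["input_ids"].shape)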
Example #2
def build_tokenized_datasets(
    raw_datasets: Union[hf_datasets.DatasetDict, hf_datasets.Dataset],
    model: torch.nn.Module,
    data_config: Union[Dict, attrdict.AttrDict],
    tokenizer: Any,
    text_column_name: str,
    label_column_name: str,
    label_to_id: Dict,
) -> Union[hf_datasets.Dataset, hf_datasets.DatasetDict]:
    padding = "max_length" if data_config.pad_to_max_length else False

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words
            # (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they
                # are automatically ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label
                # or -100, depending on the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]]
                                     if data_config.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        num_proc=data_config.preprocessing_num_workers,
        load_from_cache_file=not data_config.overwrite_cache,
        batched=True,
    )

    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(model, data)

    return tokenized_datasets
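The alignment loop above depends on word_ids() from a fast tokenizer to map sub-tokens back to words. A minimal sketch of the same rule on a toy sentence; the tag set, label_to_id mapping, and "bert-base-uncased" tokenizer are assumptions for illustration.

# Sketch only: the tag names, label_to_id, and "bert-base-uncased" are
# illustrative assumptions; the alignment rule mirrors the loop above.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased",
                                                       use_fast=True)
words = ["Hugging", "Face", "is", "in", "NYC"]
word_labels = ["B-ORG", "I-ORG", "O", "O", "B-LOC"]
label_to_id = {"O": 0, "B-ORG": 1, "I-ORG": 2, "B-LOC": 3}

encoded = tokenizer(words, is_split_into_words=True)
label_ids, previous_word_idx = [], None
for word_idx in encoded.word_ids():
    if word_idx is None:                 # special tokens are ignored by the loss
        label_ids.append(-100)
    elif word_idx != previous_word_idx:  # first sub-token of a word keeps the label
        label_ids.append(label_to_id[word_labels[word_idx]])
    else:                                # later sub-tokens ignored (label_all_tokens=False)
        label_ids.append(-100)
    previous_word_idx = word_idx
print(list(zip(encoded.tokens(), label_ids)))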
Example #3
def compute_metrics(
    data_config,
    column_names,
    post_processing_function,
    raw_datasets,
    tokenized_datasets,
    model,
    metric,
    predictions,
):
    inds, predictions = zip(*predictions)
    inds = np.hstack(inds)
    sorted_inds = np.argsort(inds)
    predictions = zip(*predictions)
    predictions = [utils.expand_like(p) for p in predictions]
    predictions = [p[sorted_inds] for p in predictions]

    # We need to add back in columns needed for validation.
    tokenized_datasets["validation"].set_format(
        type=tokenized_datasets["validation"].format["type"],
        columns=list(tokenized_datasets["validation"].features.keys()),
    )
    output = post_processing_function(
        examples=raw_datasets["validation"],
        features=tokenized_datasets["validation"],
        predictions=predictions,
        data_args=data_config,
        column_names=column_names,
        prefix="eval",
        model=model,
    )
    result = metric.compute(predictions=output.predictions,
                            references=output.label_ids)
    # Then remove them again so that data collation doesn't break.
    hf.remove_unused_columns(model, tokenized_datasets["validation"])
    return result
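The first few lines of compute_metrics restore dataset order: prediction chunks arrive as (indices, values) pairs in arbitrary order, and an argsort over the stacked indices puts them back. A toy numpy-only sketch of that reordering (utils.expand_like omitted):

# Toy reordering sketch: chunks of (indices, values) in arbitrary order are
# stitched back into dataset order with argsort.
import numpy as np

chunks = [
    (np.array([2, 3]), np.array([0.2, 0.3])),  # (example indices, logit-like values)
    (np.array([0, 1]), np.array([0.0, 0.1])),
]
inds, values = zip(*chunks)
inds = np.hstack(inds)        # [2, 3, 0, 1]
values = np.hstack(values)    # [0.2, 0.3, 0.0, 0.1]
sorted_inds = np.argsort(inds)
print(values[sorted_inds])    # [0.  0.1 0.2 0.3] -- back in dataset order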
Example #4
    def compute_metrics(predictions):
        predictions = zip(*predictions)
        predictions = [utils.expand_like(p) for p in predictions]
        # We need to add back in columns needed for validation.
        self.tokenized_datasets["validation"].set_format(
            type=self.tokenized_datasets["validation"].format["type"],
            columns=list(
                self.tokenized_datasets["validation"].features.keys()),
        )
        output = self.data_processors.post_processing_function(
            examples=self.raw_datasets["validation"],
            features=self.tokenized_datasets["validation"],
            predictions=predictions,
            data_args=self.data_config,
            column_names=self.column_names,
            prefix="eval",
            model=self.model,
        )
        result = metric.compute(predictions=output.predictions,
                                references=output.label_ids)
        # Then remove them again so that data collation doesn't break.
        hf.remove_unused_columns(self.model,
                                 self.tokenized_datasets["validation"])
        return result
Example #5
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        # When using your own dataset or a different dataset from swag, you will probably need
        # to change this.
        ending_names = [f"ending{i}" for i in range(4)]
        context_name = "sent1"
        question_header_name = "sent2"

        padding = "max_length" if self.data_config.pad_to_max_length else False
        if self.data_config.max_seq_length is None:
            max_seq_length = self.tokenizer.model_max_length
            if max_seq_length > 1024:
                self.logger.warning(
                    "The tokenizer picked seems to have a very large `model_max_length` "
                    f"({self.tokenizer.model_max_length}). Using 1024 instead. You can change "
                    "that default value by setting max_seq_length in the experiment config."
                )
                max_seq_length = 1024
        else:
            if self.data_config.max_seq_length > self.tokenizer.model_max_length:
                self.logger.warning(
                    f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                    f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                    f"Using max_seq_length={self.tokenizer.model_max_length}.")
            max_seq_length = min(self.data_config.max_seq_length,
                                 self.tokenizer.model_max_length)

        # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
        # want map to be able to cache the output of the tokenizer.  Hence, the preprocess_function
        # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
        def preprocess_function(tokenizer, padding, max_seq_length, examples):
            first_sentences = [[context] * 4
                               for context in examples[context_name]]
            question_headers = examples[question_header_name]
            second_sentences = [[
                f"{header} {examples[end][i]}" for end in ending_names
            ] for i, header in enumerate(question_headers)]

            # Flatten out
            first_sentences = sum(first_sentences, [])
            second_sentences = sum(second_sentences, [])

            # Tokenize
            tokenized_examples = tokenizer(
                first_sentences,
                second_sentences,
                truncation=True,
                max_length=max_seq_length,
                padding=padding,
            )
            # Un-flatten
            return {
                k: [v[i:i + 4] for i in range(0, len(v), 4)]
                for k, v in tokenized_examples.items()
            }

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(preprocess_function, self.tokenizer, padding,
                              max_seq_length),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        for _, data in tokenized_datasets.items():
            hf.remove_unused_columns(self.model, data)

        # Data collator
        self.collator = (transformers.default_data_collator
                         if self.data_config.pad_to_max_length else
                         DataCollatorForMultipleChoice(
                             tokenizer=self.tokenizer))
        return tokenized_datasets
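preprocess_function above flattens the four candidate pairs per example before tokenizing and then regroups them in blocks of four. A tokenizer-free sketch of that flatten/un-flatten round trip on toy data:

# Tokenizer-free sketch of the flatten / un-flatten round trip for multiple
# choice: 4 candidate pairs per example, tokenized flat, regrouped in fours.
examples = {
    "sent1": ["context A", "context B"],
    "sent2": ["question A", "question B"],
    "ending0": ["e0A", "e0B"], "ending1": ["e1A", "e1B"],
    "ending2": ["e2A", "e2B"], "ending3": ["e3A", "e3B"],
}
ending_names = [f"ending{i}" for i in range(4)]

first_sentences = [[context] * 4 for context in examples["sent1"]]
second_sentences = [[f"{header} {examples[end][i]}" for end in ending_names]
                    for i, header in enumerate(examples["sent2"])]
flat_first = sum(first_sentences, [])    # 8 strings
flat_second = sum(second_sentences, [])  # 8 strings, paired with flat_first
# ... a real tokenizer would encode (flat_first, flat_second) here ...
regrouped = [flat_second[i:i + 4] for i in range(0, len(flat_second), 4)]
print(len(flat_first), regrouped)        # 8, then two groups of 4 candidates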
Example #6
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        # Preprocessing the datasets
        if self.hparams.finetuning_task is not None:
            sentence1_key, sentence2_key = task_to_keys[
                self.hparams.finetuning_task]
        else:
            # We try to have some nice defaults but don't hesitate to tweak to your use case.
            non_label_column_names = [
                name for name in self.raw_datasets["train"].column_names
                if name != "label"
            ]
            if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
                sentence1_key, sentence2_key = "sentence1", "sentence2"
            else:
                if len(non_label_column_names) >= 2:
                    sentence1_key, sentence2_key = non_label_column_names[:2]
                else:
                    sentence1_key, sentence2_key = non_label_column_names[
                        0], None

        # Padding strategy
        if self.data_config.pad_to_max_length:
            padding = "max_length"
        else:
            # We will pad later, dynamically at batch creation to the max_seq_length in each batch.
            padding = False

        # Some models have set the order of the labels to use, so let's make sure we do use it.
        label_to_id = None
        if (self.model.config.label2id != transformers.PretrainedConfig(
                num_labels=self.hparams.num_labels).label2id
                and self.hparams.finetuning_task is not None
                and not self.is_regression):
            # Some have all caps in their config, some don't.
            label_name_to_id = {
                k.lower(): v
                for k, v in self.model.config.label2id.items()
            }
            if sorted(label_name_to_id.keys()) == sorted(self.label_list):
                label_to_id = {
                    i: label_name_to_id[self.label_list[i]]
                    for i in range(self.hparams.num_labels)
                }
            else:
                self.logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the "
                    f"dataset: model labels: {sorted(label_name_to_id.keys())}, "
                    f"dataset labels: {sorted(self.label_list)}."
                    "\nIgnoring the model labels as a result.", )
        elif self.hparams.finetuning_task is None and not self.is_regression:
            label_to_id = {v: i for i, v in enumerate(self.label_list)}

        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger than "
                f"the maximum length for the model ({self.tokenizer.model_max_length}). Using "
                f"max_seq_length={self.tokenizer.model_max_length}.")
        max_seq_length = min(self.data_config.max_seq_length,
                             self.tokenizer.model_max_length)

        # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
        # want map to be able to cache the output of the tokenizer.  Hence, the preprocess_function
        # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
        def preprocess_function(tokenizer, padding, max_seq_length, examples):
            # Tokenize the texts
            args = ((examples[sentence1_key], ) if sentence2_key is None else
                    (examples[sentence1_key], examples[sentence2_key]))
            result = tokenizer(*args,
                               padding=padding,
                               max_length=max_seq_length,
                               truncation=True)

            # Map labels to IDs (not necessary for GLUE tasks)
            if label_to_id is not None and "label" in examples:
                result["label"] = [
                    label_to_id[label] for label in examples["label"]
                ]
            return result

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(preprocess_function, self.tokenizer, padding,
                              max_seq_length),
            batched=True,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        for _, data in tokenized_datasets.items():
            hf.remove_unused_columns(self.model, data)

        # Data collator will default to DataCollatorWithPadding, so we change it if we already
        # did the padding.
        if self.data_config.pad_to_max_length:
            self.collator = transformers.default_data_collator
        elif self.hparams.use_apex_amp:
            collator = transformers.DataCollatorWithPadding(
                self.tokenizer, pad_to_multiple_of=8)
            self.collator = lambda x: collator(x).data
        else:
            self.collator = None
        return tokenized_datasets
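As the comment above notes, the tokenizer is bound through functools.partial instead of being captured from self, so the function handed to map carries only the objects it needs and its outputs can be cached. A small sketch of that binding pattern; fake_tokenizer is a stand-in so the snippet runs without transformers.

# Sketch of the functools.partial binding; fake_tokenizer stands in for a real
# transformers tokenizer so the snippet is self-contained.
import functools

def preprocess_function(tokenizer, padding, max_seq_length, examples):
    return tokenizer(examples["sentence1"],
                     padding=padding,
                     max_length=max_seq_length,
                     truncation=True)

def fake_tokenizer(texts, padding, max_length, truncation):
    # Stand-in: a real tokenizer would return input_ids / attention_mask here.
    return {"input_ids": [[len(t)] for t in texts]}

bound = functools.partial(preprocess_function, fake_tokenizer, "max_length", 128)
# `bound` now only takes the examples dict, which is what a batched
# datasets.map function receives.
print(bound({"sentence1": ["hello world", "another sentence"]}))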
Example #7
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        column_names = self.raw_datasets["train"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        def tokenize_function(tokenizer, examples):
            return tokenizer(examples[text_column_name])

        # We cannot use self.tokenizer as a non-local variable in the tokenize_function if we want
        # map to be able to cache the output of the tokenizer.  Hence, the tokenize_function takes
        # a tokenizer explicitly as an input and we create a closure using functools.partial.
        tokenized_datasets = self.raw_datasets.map(
            functools.partial(tokenize_function, self.tokenizer),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )

        if self.data_config.max_seq_length is None:
            max_seq_length = self.tokenizer.model_max_length
            if max_seq_length > 1024:
                self.logger.warning(
                    "The tokenizer picked seems to have a very large `model_max_length` "
                    f"({self.tokenizer.model_max_length}). Using 1024 instead. You can change "
                    "that default value by setting max_seq_length in the experiment config."
                )
                max_seq_length = 1024
        else:
            if self.data_config.max_seq_length > self.tokenizer.model_max_length:
                self.logger.warning(
                    f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                    f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                    f"Using max_seq_length={self.tokenizer.model_max_length}."
                )
            max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from our dataset and
        # generate chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead
            # of this drop, you can customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so
        # group_texts throws away a remainder for each of those groups of 1,000 texts.
        # You can adjust that batch_size here but a higher value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map
        # method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        for _, data in lm_datasets.items():
            hf.remove_unused_columns(self.model, data)

        self.collator = transformers.default_data_collator

        return lm_datasets
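group_texts above concatenates every tokenized column, drops the remainder shorter than max_seq_length, and slices the rest into fixed-length blocks whose labels are a copy of the input ids. A pure-Python sketch with toy token ids and max_seq_length=4:

# Pure-Python sketch of group_texts with toy token ids and max_seq_length=4:
# concatenate, drop the remainder, split into fixed-size blocks, copy labels.
examples = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]}
max_seq_length = 4

concatenated = {k: sum(v, []) for k, v in examples.items()}
total_length = (len(concatenated["input_ids"]) // max_seq_length) * max_seq_length
result = {
    k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
    for k, t in concatenated.items()
}
result["labels"] = [block.copy() for block in result["input_ids"]]
print(result["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]] -- trailing 9, 10 dropped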
Example #8
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        column_names = self.raw_datasets["train"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                f"Using max_seq_length={self.tokenizer.model_max_length}.")
        max_seq_length = min(self.data_config.max_seq_length,
                             self.tokenizer.model_max_length)

        # We cannot use self.tokenizer as a non-local variable in the tokenize_function if we want
        # map to be able to cache the output of the tokenizer.  Hence, the tokenize_function takes
        # a tokenizer explicitly as an input and we create a closure using functools.partial.
        if self.data_config.line_by_line:
            # When using line_by_line, we just tokenize each nonempty line.
            padding = "max_length" if self.data_config.pad_to_max_length else False

            def tokenize_function(tokenizer, padding, max_seq_length,
                                  examples):
                # Remove empty lines
                examples["text"] = [
                    line for line in examples["text"]
                    if len(line) > 0 and not line.isspace()
                ]
                return tokenizer(examples["text"],
                                 padding=padding,
                                 truncation=True,
                                 max_length=max_seq_length)

            tokenized_datasets = self.raw_datasets.map(
                functools.partial(tokenize_function, self.tokenizer, padding,
                                  max_seq_length),
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not self.data_config.overwrite_cache,
            )
        else:
            # Otherwise, we tokenize every text, then concatenate them together before splitting
            # them in smaller parts.
            def tokenize_function(tokenizer, examples):
                return tokenizer(examples[text_column_name])

            tokenized_datasets = self.raw_datasets.map(
                functools.partial(tokenize_function, self.tokenizer),
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not self.data_config.overwrite_cache,
            )

            # Main data processing function that will concatenate all texts from our dataset and
            # generate chunks of max_seq_length.
            def group_texts(examples):
                # Concatenate all texts.
                concatenated_examples = {
                    k: sum(examples[k], [])
                    for k in examples.keys()
                }
                total_length = len(concatenated_examples[list(
                    examples.keys())[0]])
                # We drop the small remainder, we could add padding if the model supported it
                # instead of this drop, you can customize this part to your needs.
                total_length = (total_length //
                                max_seq_length) * max_seq_length
                # Split by chunks of max_len.
                result = {
                    k: [
                        t[i:i + max_seq_length]
                        for i in range(0, total_length, max_seq_length)
                    ]
                    for k, t in concatenated_examples.items()
                }
                return result

            # Note that with `batched=True`, this map processes 1,000 texts together, so
            # group_texts throws away a remainder for each of those groups of 1,000 texts.
            # You can adjust that batch_size here but a higher value might be slower to preprocess.
            #
            # To speed up this part, we use multiprocessing. See the documentation of the map
            # method for more information:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                load_from_cache_file=not self.data_config.overwrite_cache,
            )
        for _, data in tokenized_datasets.items():
            hf.remove_unused_columns(self.model, data)

        self.collator = transformers.DataCollatorForPermutationLanguageModeling(
            tokenizer=self.tokenizer,
            plm_probability=self.data_config.plm_probability,
            max_span_length=self.data_config.max_span_length,
        )
        return tokenized_datasets
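In the line_by_line branch above, empty and whitespace-only lines are dropped before tokenization so they never become training examples. A tiny sketch of that filter on toy data:

# Tiny sketch of the empty-line filter from the line_by_line branch above.
examples = {"text": ["A real line.", "", "   ", "Another line."]}
examples["text"] = [
    line for line in examples["text"] if len(line) > 0 and not line.isspace()
]
print(examples["text"])  # ['A real line.', 'Another line.']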