Example #1
    def build_datasets(
            self) -> Dict[str, Union[datasets.Dataset, datasets.DatasetDict]]:
        tokenized_datasets = {}
        for split in ["train", "validation"]:
            # Tokenize each split with prepare_features, bound via functools.partial
            # so that datasets.map can hash its arguments and cache the result.
            tokenized_datasets[split] = self.raw_datasets[split].map(
                functools.partial(
                    self.data_processors.prepare_features,
                    split,
                    self.data_config,
                    self.tokenizer,
                    self.column_names,
                ),
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                remove_columns=self.column_names,
                load_from_cache_file=not self.data_config.overwrite_cache,
            )
            hf.remove_unused_columns(self.model, tokenized_datasets[split])
        # Inputs padded to max_length can use the default collator; otherwise pad
        # dynamically per batch (to a multiple of 8 when apex amp is enabled).
        if self.data_config.pad_to_max_length:
            self.collator = transformers.default_data_collator
        else:
            collator = transformers.DataCollatorWithPadding(
                self.tokenizer,
                pad_to_multiple_of=8 if self.hparams.use_apex_amp else None)
            self.collator = lambda x: collator(x).data
        return tokenized_datasets
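
The collator chosen above is stored on `self` rather than returned. As a minimal sketch (not from the original class) of how the returned splits and that collator might be consumed with a standard PyTorch DataLoader, where `trial` and the batch size are hypothetical:

from torch.utils.data import DataLoader

tokenized = trial.build_datasets()      # trial: an instance of the class above
train_loader = DataLoader(
    tokenized["train"],
    batch_size=16,                      # illustrative value only
    shuffle=True,
    collate_fn=trial.collator,          # set as a side effect of build_datasets
)
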
Example #2
    def build_datasets(
            self) -> Dict[str, Union[datasets.Dataset, datasets.DatasetDict]]:
        if self.data_config.pad_to_max_length:
            padding = "max_length"
        else:
            # We will pad later, dynamically at batch creation, to the longest sequence in each batch.
            padding = False

        # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
        # want map to be able to cache the output of the tokenizer.  Hence, the preprocess_function
        # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
        def preprocess_function(tokenizer, padding, max_length, examples):
            # Tokenize the texts
            return tokenizer(
                examples["premise"],
                examples["hypothesis"],
                padding=padding,
                max_length=max_length,
                truncation=True,
            )

        train_dataset = self.raw_datasets["train"].map(
            functools.partial(preprocess_function, self.tokenizer, padding,
                              self.data_config.max_seq_length),
            batched=True,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        eval_dataset = self.raw_datasets["validation"].map(
            functools.partial(preprocess_function, self.tokenizer, padding,
                              self.data_config.max_seq_length),
            batched=True,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )

        if self.data_config.pad_to_max_length:
            self.collator = transformers.default_data_collator
        else:
            collator = transformers.DataCollatorWithPadding(
                self.tokenizer,
                pad_to_multiple_of=8 if self.hparams.use_apex_amp else None)
            self.collator = lambda x: collator(x).data

        return {"train": train_dataset, "validation": eval_dataset}
Example #3
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        # Preprocessing the datasets
        if self.hparams.finetuning_task is not None:
            sentence1_key, sentence2_key = task_to_keys[
                self.hparams.finetuning_task]
        else:
            # We try to have some nice defaults but don't hesitate to tweak to your use case.
            non_label_column_names = [
                name for name in self.raw_datasets["train"].column_names
                if name != "label"
            ]
            if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
                sentence1_key, sentence2_key = "sentence1", "sentence2"
            else:
                if len(non_label_column_names) >= 2:
                    sentence1_key, sentence2_key = non_label_column_names[:2]
                else:
                    sentence1_key, sentence2_key = non_label_column_names[
                        0], None

        # Padding strategy
        if self.data_config.pad_to_max_length:
            padding = "max_length"
        else:
            # We will pad later, dynamically at batch creation, to the longest sequence in each batch.
            padding = False

        # Some models have set the order of the labels to use, so let's make sure we do use it.
        label_to_id = None
        if (self.model.config.label2id != transformers.PretrainedConfig(
                num_labels=self.hparams.num_labels).label2id
                and self.hparams.finetuning_task is not None
                and not self.is_regression):
            # Some have all caps in their config, some don't.
            label_name_to_id = {
                k.lower(): v
                for k, v in self.model.config.label2id.items()
            }
            if sorted(label_name_to_id.keys()) == sorted(self.label_list):
                label_to_id = {
                    i: label_name_to_id[self.label_list[i]]
                    for i in range(self.hparams.num_labels)
                }
            else:
                self.logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the "
                    f"dataset: model labels: {sorted(label_name_to_id.keys())}, "
                    f"dataset labels: {sorted(self.label_list)}."
                    "\nIgnoring the model labels as a result.", )
        elif self.hparams.finetuning_task is None and not self.is_regression:
            label_to_id = {v: i for i, v in enumerate(self.label_list)}

        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger than "
                f"the maximum length for the model ({self.tokenizer.model_max_length}). Using "
                f"max_seq_length={self.tokenizer.model_max_length}.")
        max_seq_length = min(self.data_config.max_seq_length,
                             self.tokenizer.model_max_length)

        # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
        # want map to be able to cache the output of the tokenizer.  Hence, the preprocess_function
        # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
        def preprocess_function(tokenizer, padding, max_seq_length, examples):
            # Tokenize the texts
            args = ((examples[sentence1_key], ) if sentence2_key is None else
                    (examples[sentence1_key], examples[sentence2_key]))
            result = tokenizer(*args,
                               padding=padding,
                               max_length=max_seq_length,
                               truncation=True)

            # Map labels to IDs (not necessary for GLUE tasks)
            if label_to_id is not None and "label" in examples:
                result["label"] = [
                    label_to_id[label] for label in examples["label"]
                ]
            return result

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(preprocess_function, self.tokenizer, padding,
                              max_seq_length),
            batched=True,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        for _, data in tokenized_datasets.items():
            hf.remove_unused_columns(self.model, data)

        # Data collator will default to DataCollatorWithPadding, so we change it if we already
        # did the padding.
        if self.data_config.pad_to_max_length:
            self.collator = transformers.default_data_collator
        elif self.hparams.use_apex_amp:
            collator = transformers.DataCollatorWithPadding(
                self.tokenizer, pad_to_multiple_of=8)
            self.collator = lambda x: collator(x).data
        else:
            self.collator = None
        return tokenized_datasets
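
task_to_keys is referenced above but defined elsewhere; in the Hugging Face GLUE examples it maps each task name to the one or two text columns to tokenize. A sketch of what that mapping typically looks like (the values follow the usual GLUE column names and are not copied from the original file):

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

A None second key makes the preprocess_function above tokenize a single sentence instead of a sentence pair.
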
Example #4
args = transformers.TrainingArguments(
    output_dir=_dir,  # assumed; the earlier arguments of this call are not shown in the snippet
    logging_dir=f"{_dir}/logging",
    logging_steps=256,
    dataloader_num_workers=64,
    evaluation_strategy="steps",
    eval_steps=256,
    save_steps=256,
    fp16=True,
    fp16_opt_level="O3",
    learning_rate=5e-4,
    run_name=_dir,
)

model = transformers.AlbertForSequenceClassification.from_pretrained(
    "albert-large-v2", num_labels=2)
tokenizer = transformers.AlbertTokenizerFast.from_pretrained("albert-large-v2")
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer,
                                                     pad_to_multiple_of=32)
trainer = transformers.Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
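
compute_metrics is passed to the Trainer above but not shown in the snippet; a minimal sketch of what a two-label accuracy metric could look like (the body here is an assumption, not the original function):

import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

With evaluation_strategy="steps" and eval_steps=256, the Trainer calls this on the eval set every 256 steps during trainer.train().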