Example #1
def map(dataset: nlp.Dataset, **kwargs):
    # Forward all keyword arguments to nlp.Dataset.map and discard the resulting dataset
    _ = dataset.map(**kwargs)
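
For context, a call like the one below would exercise this helper. The `nlp.load_dataset("imdb", ...)` call and the identity function are illustrative assumptions, not part of the original example:

import nlp

dataset = nlp.load_dataset("imdb", split="train")
# Any keyword argument accepted by nlp.Dataset.map can be forwarded, e.g. a batched identity map
map(dataset, function=lambda batch: batch, batched=True)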
Example #2
    def train(
        self,
        training_args: TrainingArguments,
        train_dataset: nlp.Dataset,
        eval_dataset: nlp.Dataset,
        text_col_nm: str = "text",
        label_col_nm: str = "label",
        compute_metrics: Callable = None,
    ) -> None:
        """Trains and/or finetunes the sequence classification model

        * **training_args** - Transformers `TrainingArguments` object model
        * **train_dataset** - Training `Dataset` class object from the nlp library
        * **eval_dataset** - Eval `Dataset` class object from the nlp library
        * **text_col_nm** - Name of the text feature column used as training data (Default "text")
        * **label_col_nm** - Name of the label feature column (Default "label")
        * **compute_metrics** - Custom metrics function callable for `transformers.Trainer`'s compute metrics
        * **return** - None
        """
        # Set default metrics if None
        if not compute_metrics:
            compute_metrics = self._default_metrics

        # Set nlp.Dataset label values in the sequence classifier configuration
        ## Important NOTE: Updating the configuration does not update the sequence classification head layer
        ## We manually initialize a new linear layer for the "new" labels being trained
        class_label = train_dataset.features[label_col_nm]
        config_data = {
            "num_labels": class_label.num_classes,
            "id2label": {v: n for v, n in enumerate(class_label.names)},
            "label2id": {n: v for v, n in enumerate(class_label.names)},
        }
        self.model.config.update(config_data)
        self._mutate_model_head(class_label=class_label)

        # Tokenize both datasets with a single batched map call, then set their format to torch tensors
        def tokenize(batch):
            return self.tokenizer(batch[text_col_nm], padding=True, truncation=True)

        train_dataset = train_dataset.map(
            tokenize, batch_size=len(train_dataset), batched=True
        )
        eval_dataset = eval_dataset.map(
            tokenize, batch_size=len(eval_dataset), batched=True
        )
        train_dataset.set_format(
            "torch", columns=["input_ids", "attention_mask", label_col_nm]
        )
        eval_dataset.set_format(
            "torch", columns=["input_ids", "attention_mask", label_col_nm]
        )

        # Instantiate transformers trainer
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Train and serialize
        self.trainer.train()
        self.trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)
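
A minimal sketch of how this method might be invoked follows. `SequenceClassifierWrapper` is a hypothetical stand-in for whatever class owns this `train` method (it must expose `model` and `tokenizer` attributes); the `ag_news` dataset and the hyperparameters are illustrative only:

import nlp
from transformers import TrainingArguments

train_ds = nlp.load_dataset("ag_news", split="train")
eval_ds = nlp.load_dataset("ag_news", split="test")

args = TrainingArguments(
    output_dir="./sc_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
)

# `classifier` is a hypothetical instance of the wrapper class this method belongs to
classifier = SequenceClassifierWrapper()
classifier.train(
    training_args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    text_col_nm="text",
    label_col_nm="label",
)

Note that the label column must be a `ClassLabel` feature, since the method reads `num_classes` and `names` from it to rebuild the classification head.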