def map(dataset: nlp.Dataset, **kwargs):
    """Runs `Dataset.map` with the given kwargs, discarding the returned dataset."""
    _ = dataset.map(**kwargs)
def train(
    self,
    training_args: TrainingArguments,
    train_dataset: nlp.Dataset,
    eval_dataset: nlp.Dataset,
    text_col_nm: str = "text",
    label_col_nm: str = "label",
    compute_metrics: Callable = None,
) -> None:
    """Trains and/or fine-tunes the sequence classification model

    * **training_args** - Transformers `TrainingArguments` object
    * **train_dataset** - Training `Dataset` class object from the nlp library
    * **eval_dataset** - Eval `Dataset` class object from the nlp library
    * **text_col_nm** - Name of the text feature column used as training data (Default "text")
    * **label_col_nm** - Name of the label feature column (Default "label")
    * **compute_metrics** - Custom metrics function callable for `transformers.Trainer`'s compute metrics
    * **return** - None
    """
    # Set default metrics if None
    if not compute_metrics:
        compute_metrics = self._default_metrics

    # Set nlp.Dataset label values in the sequence classifier configuration
    ## Important NOTE: Updating the configuration does not update the sequence classification head module layer
    ## We manually initialize a new linear layer for the "new" labels being trained
    class_label = train_dataset.features[label_col_nm]
    config_data = {
        "num_labels": class_label.num_classes,
        "id2label": {v: n for v, n in enumerate(class_label.names)},
        "label2id": {n: v for v, n in enumerate(class_label.names)},
    }
    self.model.config.update(config_data)
    self._mutate_model_head(class_label=class_label)

    # Batch map datasets as torch tensors with the tokenizer
    def tokenize(batch):
        return self.tokenizer(batch[text_col_nm], padding=True, truncation=True)

    train_dataset = train_dataset.map(
        tokenize, batch_size=len(train_dataset), batched=True
    )
    eval_dataset = eval_dataset.map(
        tokenize, batch_size=len(eval_dataset), batched=True
    )
    train_dataset.set_format(
        "torch", columns=["input_ids", "attention_mask", label_col_nm]
    )
    eval_dataset.set_format(
        "torch", columns=["input_ids", "attention_mask", label_col_nm]
    )

    # Instantiate the transformers trainer
    self.trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and serialize
    self.trainer.train()
    self.trainer.save_model()
    self.tokenizer.save_pretrained(training_args.output_dir)
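
# Example usage (illustrative sketch, not part of the original module): the call
# below assumes this `train` method lives on a sequence-classification wrapper
# class, referred to here by the hypothetical name `SequenceClassifier`, and that
# the `nlp` and `transformers` libraries are installed. Adjust the class name,
# constructor arguments, and dataset to your actual setup.
if __name__ == "__main__":
    import nlp
    from transformers import TrainingArguments

    # Small slices of AG News keep the example quick to run
    train_ds = nlp.load_dataset("ag_news", split="train[:1%]")
    eval_ds = nlp.load_dataset("ag_news", split="test[:1%]")

    args = TrainingArguments(
        output_dir="./seq_clf_output",
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
    )

    classifier = SequenceClassifier()  # hypothetical wrapper exposing `train`
    classifier.train(
        training_args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        text_col_nm="text",
        label_col_nm="label",
    )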