def build_datasets(self) -> Dict[str, Union[datasets.Dataset, datasets.DatasetDict]]:
    tokenized_datasets = {}
    for split in ["train", "validation"]:
        # Tokenize each split with the task-specific feature-preparation function,
        # dropping the raw text columns afterwards.
        tokenized_datasets[split] = self.raw_datasets[split].map(
            functools.partial(
                self.data_processors.prepare_features,
                split,
                self.data_config,
                self.tokenizer,
                self.column_names,
            ),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=self.column_names,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        hf.remove_unused_columns(self.model, tokenized_datasets[split])

    # If everything was already padded to max length, the default collator suffices;
    # otherwise pad dynamically per batch (to a multiple of 8 under apex AMP so that
    # tensor cores are used efficiently).
    if self.data_config.pad_to_max_length:
        self.collator = transformers.default_data_collator
    else:
        collator = transformers.DataCollatorWithPadding(
            self.tokenizer,
            pad_to_multiple_of=8 if self.hparams.use_apex_amp else None,
        )
        self.collator = lambda x: collator(x).data

    return tokenized_datasets
def build_datasets(self) -> Dict[str, Union[datasets.Dataset, datasets.DatasetDict]]:
    if self.data_config.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max_seq_length in each batch.
        padding = False

    # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
    # want map to be able to cache the output of the tokenizer. Hence, the preprocess_function
    # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
    def preprocess_function(tokenizer, padding, max_length, examples):
        # Tokenize the texts
        return tokenizer(
            examples["premise"],
            examples["hypothesis"],
            padding=padding,
            max_length=max_length,
            truncation=True,
        )

    train_dataset = self.raw_datasets["train"].map(
        functools.partial(preprocess_function, self.tokenizer, padding,
                          self.data_config.max_seq_length),
        batched=True,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )
    eval_dataset = self.raw_datasets["validation"].map(
        functools.partial(preprocess_function, self.tokenizer, padding,
                          self.data_config.max_seq_length),
        batched=True,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )

    if self.data_config.pad_to_max_length:
        self.collator = transformers.default_data_collator
    else:
        collator = transformers.DataCollatorWithPadding(
            self.tokenizer, pad_to_multiple_of=8 if self.hparams.use_apex_amp else None)
        self.collator = lambda x: collator(x).data

    return {"train": train_dataset, "validation": eval_dataset}
def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
    # Preprocessing the datasets
    if self.hparams.finetuning_task is not None:
        sentence1_key, sentence2_key = task_to_keys[self.hparams.finetuning_task]
    else:
        # We try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in self.raw_datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if self.data_config.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max_seq_length in each batch.
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (self.model.config.label2id
            != transformers.PretrainedConfig(num_labels=self.hparams.num_labels).label2id
            and self.hparams.finetuning_task is not None
            and not self.is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in self.model.config.label2id.items()}
        if sorted(label_name_to_id.keys()) == sorted(self.label_list):
            label_to_id = {
                i: label_name_to_id[self.label_list[i]]
                for i in range(self.hparams.num_labels)
            }
        else:
            self.logger.warning(
                "Your model seems to have been trained with labels, but they don't match the "
                f"dataset: model labels: {sorted(label_name_to_id.keys())}, "
                f"dataset labels: {sorted(self.label_list)}."
                "\nIgnoring the model labels as a result.",
            )
    elif self.hparams.finetuning_task is None and not self.is_regression:
        label_to_id = {v: i for i, v in enumerate(self.label_list)}

    if self.data_config.max_seq_length > self.tokenizer.model_max_length:
        self.logger.warning(
            f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger than "
            f"the maximum length for the model ({self.tokenizer.model_max_length}). Using "
            f"max_seq_length={self.tokenizer.model_max_length}.")
    max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

    # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
    # want map to be able to cache the output of the tokenizer. Hence, the preprocess_function
    # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
    def preprocess_function(tokenizer, padding, max_seq_length, examples):
        # Tokenize the texts
        args = ((examples[sentence1_key],) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[label] for label in examples["label"]]
        return result

    tokenized_datasets = self.raw_datasets.map(
        functools.partial(preprocess_function, self.tokenizer, padding, max_seq_length),
        batched=True,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )

    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(self.model, data)

    # Data collator will default to DataCollatorWithPadding, so we change it if we already
    # did the padding.
    if self.data_config.pad_to_max_length:
        self.collator = transformers.default_data_collator
    elif self.hparams.use_apex_amp:
        collator = transformers.DataCollatorWithPadding(self.tokenizer, pad_to_multiple_of=8)
        self.collator = lambda x: collator(x).data
    else:
        self.collator = None

    return tokenized_datasets
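# Usage sketch (an assumption, not from the original code): the splits returned by
# build_datasets and the collator stored on self.collator can be fed to ordinary
# PyTorch DataLoaders roughly as below. `trial` is a placeholder for whatever object
# defines build_datasets, and the batch size is illustrative. If self.collator was
# left as None (last variant above), pass an explicit DataCollatorWithPadding instead.
from torch.utils.data import DataLoader

tokenized = trial.build_datasets()
train_loader = DataLoader(
    tokenized["train"], batch_size=16, shuffle=True, collate_fn=trial.collator)
eval_loader = DataLoader(
    tokenized["validation"], batch_size=16, collate_fn=trial.collator)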
logging_dir=f"{_dir}/logging", logging_steps=256, dataloader_num_workers=64, evaluation_strategy="steps", eval_steps=256, save_steps=256, fp16=True, fp16_opt_level="O3", learning_rate=5e-4, run_name=_dir, ) model = transformers.AlbertForSequenceClassification.from_pretrained( "albert-large-v2", num_labels=2) tokenizer = transformers.AlbertTokenizerFast.from_pretrained("albert-large-v2") data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=32) trainer = transformers.Trainer( args=args, model=model, tokenizer=tokenizer, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=test_dataset, compute_metrics=compute_metrics, ) # In[ ]: trainer.train()