def build_datasets(self) -> Dict[str, Union[datasets.Dataset, datasets.DatasetDict]]:
    tokenized_datasets = {}
    for split in ["train", "validation"]:
        tokenized_datasets[split] = self.raw_datasets[split].map(
            functools.partial(
                self.data_processors.prepare_features,
                split,
                self.data_config,
                self.tokenizer,
                self.column_names,
            ),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=self.column_names,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        hf.remove_unused_columns(self.model, tokenized_datasets[split])

    if self.data_config.pad_to_max_length:
        self.collator = transformers.default_data_collator
    else:
        collator = transformers.DataCollatorWithPadding(
            self.tokenizer, pad_to_multiple_of=8 if self.hparams.use_apex_amp else None
        )
        self.collator = lambda x: collator(x).data
    return tokenized_datasets
def build_tokenized_datasets(
    raw_datasets: Union[hf_datasets.DatasetDict, hf_datasets.Dataset],
    model: torch.nn.Module,
    data_config: Union[Dict, attrdict.AttrDict],
    tokenizer: Any,
    text_column_name: str,
    label_column_name: str,
    label_to_id: Dict,
) -> Union[hf_datasets.Dataset, hf_datasets.DatasetDict]:
    padding = "max_length" if data_config.pad_to_max_length else False

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words
            # (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they
                # are automatically ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label
                # or -100, depending on the label_all_tokens flag.
                else:
                    label_ids.append(
                        label_to_id[label[word_idx]] if data_config.label_all_tokens else -100
                    )
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        num_proc=data_config.preprocessing_num_workers,
        load_from_cache_file=not data_config.overwrite_cache,
        batched=True,
    )
    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(model, data)
    return tokenized_datasets
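# --- Illustrative sketch, not part of the original source: how word_ids() lets
# tokenize_and_align_labels map word-level tags onto subword tokens. The checkpoint name and
# the toy tags below are assumptions chosen only for this example; -100 marks positions the
# loss function ignores.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # any fast tokenizer works
words = ["Determined", "trains", "transformers"]
tags = [1, 0, 0]  # hypothetical word-level label ids

encoding = tokenizer([words], is_split_into_words=True)
word_ids = encoding.word_ids(batch_index=0)  # e.g. [None, 0, 0, 1, 2, None]

aligned = []
previous_word_idx = None
for word_idx in word_ids:
    if word_idx is None:
        aligned.append(-100)  # special tokens such as [CLS] and [SEP]
    elif word_idx != previous_word_idx:
        aligned.append(tags[word_idx])  # first subword of a word keeps the word's label
    else:
        aligned.append(-100)  # later subwords ignored when label_all_tokens is False
    previous_word_idx = word_idx
print(aligned)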
def compute_metrics(
    data_config,
    column_names,
    post_processing_function,
    raw_datasets,
    tokenized_datasets,
    model,
    metric,
    predictions,
):
    inds, predictions = zip(*predictions)
    inds = np.hstack(inds)
    sorted_inds = np.argsort(inds)
    predictions = zip(*predictions)
    predictions = [utils.expand_like(p) for p in predictions]
    predictions = [p[sorted_inds] for p in predictions]

    # We need to add back in columns needed for validation.
    tokenized_datasets["validation"].set_format(
        type=tokenized_datasets["validation"].format["type"],
        columns=list(tokenized_datasets["validation"].features.keys()),
    )
    output = post_processing_function(
        examples=raw_datasets["validation"],
        features=tokenized_datasets["validation"],
        predictions=predictions,
        data_args=data_config,
        column_names=column_names,
        prefix="eval",
        model=model,
    )
    result = metric.compute(predictions=output.predictions, references=output.label_ids)
    # Then remove them again so that data collation doesn't break.
    hf.remove_unused_columns(model, tokenized_datasets["validation"])
    return result
def compute_metrics(predictions):
    predictions = zip(*predictions)
    predictions = [utils.expand_like(p) for p in predictions]

    # We need to add back in columns needed for validation.
    self.tokenized_datasets["validation"].set_format(
        type=self.tokenized_datasets["validation"].format["type"],
        columns=list(self.tokenized_datasets["validation"].features.keys()),
    )
    output = self.data_processors.post_processing_function(
        examples=self.raw_datasets["validation"],
        features=self.tokenized_datasets["validation"],
        predictions=predictions,
        data_args=self.data_config,
        column_names=self.column_names,
        prefix="eval",
        model=self.model,
    )
    result = metric.compute(predictions=output.predictions, references=output.label_ids)
    # Then remove them again so that data collation doesn't break.
    hf.remove_unused_columns(self.model, self.tokenized_datasets["validation"])
    return result
def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
    # When using your own dataset or a different dataset from swag, you will probably need
    # to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    padding = "max_length" if self.data_config.pad_to_max_length else False

    if self.data_config.max_seq_length is None:
        max_seq_length = self.tokenizer.model_max_length
        if max_seq_length > 1024:
            self.logger.warning(
                "The tokenizer picked seems to have a very large `model_max_length` "
                f"({self.tokenizer.model_max_length}). Using 1024 instead. You can change "
                "that default value by setting max_seq_length in the experiment config."
            )
            max_seq_length = 1024
    else:
        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                f"Using max_seq_length={self.tokenizer.model_max_length}."
            )
        max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

    # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
    # want map to be able to cache the output of the tokenizer. Hence, the preprocess_function
    # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
    def preprocess_function(tokenizer, padding, max_seq_length, examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names]
            for i, header in enumerate(question_headers)
        ]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            truncation=True,
            max_length=max_seq_length,
            padding=padding,
        )
        # Un-flatten
        return {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }

    tokenized_datasets = self.raw_datasets.map(
        functools.partial(preprocess_function, self.tokenizer, padding, max_seq_length),
        batched=True,
        num_proc=self.data_config.preprocessing_num_workers,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )
    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(self.model, data)

    # Data collator
    self.collator = (
        transformers.default_data_collator
        if self.data_config.pad_to_max_length
        else DataCollatorForMultipleChoice(tokenizer=self.tokenizer)
    )
    return tokenized_datasets
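# --- Illustrative sketch, not part of the original source: the flatten/un-flatten pattern
# used by preprocess_function above, shown on plain lists. Each SWAG example becomes four
# (context, ending) pairs for the tokenizer, and the tokenized fields are then regrouped
# four at a time so each row again corresponds to one example. The toy data and the
# fake_tokenized dict are made up for this example.
examples = {
    "sent1": ["A person is cooking.", "A dog runs outside."],
    "sent2": ["They", "It"],
    "ending0": ["stir the pot.", "chases a ball."],
    "ending1": ["leave the room.", "barks loudly."],
    "ending2": ["add more salt.", "digs a hole."],
    "ending3": ["turn off the stove.", "sleeps."],
}

first_sentences = [[context] * 4 for context in examples["sent1"]]
second_sentences = [
    [f"{header} {examples[f'ending{j}'][i]}" for j in range(4)]
    for i, header in enumerate(examples["sent2"])
]

# Flatten: 2 examples x 4 choices -> 8 sentence pairs for the tokenizer.
flat_first = sum(first_sentences, [])
flat_second = sum(second_sentences, [])

# Pretend tokenizer output: one entry per flattened pair.
fake_tokenized = {"input_ids": [[i] for i in range(len(flat_first))]}

# Un-flatten: regroup every 4 consecutive entries back into one example.
unflattened = {k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in fake_tokenized.items()}
print(unflattened["input_ids"])  # [[[0], [1], [2], [3]], [[4], [5], [6], [7]]]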
def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
    # Preprocessing the datasets
    if self.hparams.finetuning_task is not None:
        sentence1_key, sentence2_key = task_to_keys[self.hparams.finetuning_task]
    else:
        # We try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in self.raw_datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if self.data_config.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation to the max_seq_length in each batch.
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (
        self.model.config.label2id
        != transformers.PretrainedConfig(num_labels=self.hparams.num_labels).label2id
        and self.hparams.finetuning_task is not None
        and not self.is_regression
    ):
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in self.model.config.label2id.items()}
        if sorted(label_name_to_id.keys()) == sorted(self.label_list):
            label_to_id = {
                i: label_name_to_id[self.label_list[i]] for i in range(self.hparams.num_labels)
            }
        else:
            self.logger.warning(
                "Your model seems to have been trained with labels, but they don't match the "
                f"dataset: model labels: {sorted(label_name_to_id.keys())}, "
                f"dataset labels: {sorted(self.label_list)}."
                "\nIgnoring the model labels as a result.",
            )
    elif self.hparams.finetuning_task is None and not self.is_regression:
        label_to_id = {v: i for i, v in enumerate(self.label_list)}

    if self.data_config.max_seq_length > self.tokenizer.model_max_length:
        self.logger.warning(
            f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger than "
            f"the maximum length for the model ({self.tokenizer.model_max_length}). Using "
            f"max_seq_length={self.tokenizer.model_max_length}."
        )
    max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

    # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
    # want map to be able to cache the output of the tokenizer. Hence, the preprocess_function
    # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
    def preprocess_function(tokenizer, padding, max_seq_length, examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],)
            if sentence2_key is None
            else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[label] for label in examples["label"]]
        return result

    tokenized_datasets = self.raw_datasets.map(
        functools.partial(preprocess_function, self.tokenizer, padding, max_seq_length),
        batched=True,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )
    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(self.model, data)

    # Data collator will default to DataCollatorWithPadding, so we change it if we already
    # did the padding.
    if self.data_config.pad_to_max_length:
        self.collator = transformers.default_data_collator
    elif self.hparams.use_apex_amp:
        collator = transformers.DataCollatorWithPadding(self.tokenizer, pad_to_multiple_of=8)
        self.collator = lambda x: collator(x).data
    else:
        self.collator = None
    return tokenized_datasets
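# --- Illustrative sketch, an assumption rather than part of the original code: the collator
# built above plugs straight into a PyTorch DataLoader so each batch is padded dynamically.
# The checkpoint name and the toy features are made up for this example.
import torch
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
collator = transformers.DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

features = [
    {"input_ids": tokenizer("a short sentence")["input_ids"], "label": 0},
    {"input_ids": tokenizer("a somewhat longer example sentence")["input_ids"], "label": 1},
]
batch = collator(features)  # BatchEncoding; .data is a plain dict of padded tensors
print(batch.data["input_ids"].shape)  # sequence dimension padded to a multiple of 8

loader = torch.utils.data.DataLoader(
    features, batch_size=2, collate_fn=lambda x: collator(x).data
)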
def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
    column_names = self.raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(tokenizer, examples):
        return tokenizer(examples[text_column_name])

    # We cannot use self.tokenizer as a non-local variable in the tokenize_function if we want
    # map to be able to cache the output of the tokenizer. Hence, the tokenize_function takes
    # a tokenizer explicitly as an input and we create a closure using functools.partial.
    tokenized_datasets = self.raw_datasets.map(
        functools.partial(tokenize_function, self.tokenizer),
        batched=True,
        num_proc=self.data_config.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )

    if self.data_config.max_seq_length is None:
        max_seq_length = self.tokenizer.model_max_length
        if max_seq_length > 1024:
            self.logger.warning(
                "The tokenizer picked seems to have a very large `model_max_length` "
                f"({self.tokenizer.model_max_length}). Using 1024 instead. You can change "
                "that default value by setting max_seq_length in the experiment config."
            )
            max_seq_length = 1024
    else:
        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                f"Using max_seq_length={self.tokenizer.model_max_length}."
            )
        max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and
    # generate chunks of max_seq_length.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could add padding instead if the model supported it.
        # You can customize this part to your needs.
        total_length = (total_length // max_seq_length) * max_seq_length
        # Split by chunks of max_len.
        result = {
            k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so
    # group_texts throws away a remainder for each of those groups of 1,000 texts.
    # You can adjust that batch_size here but a higher value might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map
    # method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=self.data_config.preprocessing_num_workers,
        load_from_cache_file=not self.data_config.overwrite_cache,
    )
    # Strip unused columns from the datasets we actually return.
    for _, data in lm_datasets.items():
        hf.remove_unused_columns(self.model, data)

    self.collator = transformers.default_data_collator
    return lm_datasets
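# --- Illustrative sketch, not part of the original source: what group_texts does to the
# concatenated token ids, shown with a toy max_seq_length of 4. The trailing remainder
# (tokens 13 and 14) is dropped, and for causal LM the labels are a copy of the inputs.
examples = {"input_ids": [[1, 2, 3, 4, 5], [6, 7], [8, 9, 10, 11, 12, 13, 14]]}
max_seq_length = 4

concatenated = {k: sum(examples[k], []) for k in examples}
total_length = (len(concatenated["input_ids"]) // max_seq_length) * max_seq_length
chunks = {
    k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
    for k, t in concatenated.items()
}
chunks["labels"] = chunks["input_ids"].copy()
print(chunks["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]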
def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
    column_names = self.raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if self.data_config.max_seq_length > self.tokenizer.model_max_length:
        self.logger.warning(
            f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
            f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
            f"Using max_seq_length={self.tokenizer.model_max_length}."
        )
    max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

    # We cannot use self.tokenizer as a non-local variable in the tokenize_function if we want
    # map to be able to cache the output of the tokenizer. Hence, the tokenize_function takes
    # a tokenizer explicitly as an input and we create a closure using functools.partial.
    if self.data_config.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if self.data_config.pad_to_max_length else False

        def tokenize_function(tokenizer, padding, max_seq_length, examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"] if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples["text"], padding=padding, truncation=True, max_length=max_seq_length
            )

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(tokenize_function, self.tokenizer, padding, max_seq_length),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting
        # them in smaller parts.
        def tokenize_function(tokenizer, examples):
            return tokenizer(examples[text_column_name])

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(tokenize_function, self.tokenizer),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and
        # generate chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead if the model supported
            # it. You can customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so
        # group_texts throws away a remainder for each of those groups of 1,000 texts.
        # You can adjust that batch_size here but a higher value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map
        # method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )

    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(self.model, data)

    self.collator = transformers.DataCollatorForPermutationLanguageModeling(
        tokenizer=self.tokenizer,
        plm_probability=self.data_config.plm_probability,
        max_span_length=self.data_config.max_span_length,
    )
    return tokenized_datasets
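# --- Illustrative sketch, an assumption rather than part of the original code: feeding the
# permutation-LM collator directly. It needs a tokenizer with a mask token (XLNet-style) and
# sequences of even length, and it produces the perm_mask and target_mapping tensors that
# XLNet's forward pass consumes. The checkpoint name and sequence length are made up.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("xlnet-base-cased")
collator = transformers.DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer, plm_probability=1 / 6, max_span_length=5
)

encoded = tokenizer(
    "a short example sentence", padding="max_length", max_length=16, truncation=True
)
batch = collator([{"input_ids": encoded["input_ids"]}])
print(sorted(batch.keys()))  # ['input_ids', 'labels', 'perm_mask', 'target_mapping']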