def load_data(
    self,
    data: Tuple[str, Union[str, List[str]], Union[str, List[str]]],
    dataset: Optional[Any] = None,
    columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
) -> Sequence[Mapping[str, Any]]:
    """Load, label-encode and tokenize one split of a text dataset.

    Args:
        data: A ``(file, input, target)`` triple: the path of the file to
            load, the name of the input column and the name of the target
            column.
        dataset: Optional object that receives a ``num_classes`` attribute
            when running in the training stage. Skipped when ``None``.
        columns: Columns exposed in ``torch`` format on the result.

    Returns:
        The processed split keyed by the current running stage.
    """
    csv_file, input_field, target = data
    stage = self.running_stage.value
    data_files = {stage: str(csv_file)}

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING and not torch.cuda.is_available():
        try:
            # Only request the ``[:20]`` slice of the split to keep CI runs short.
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except Exception:
            # Deliberate best-effort: if the sliced load fails for any
            # reason, fall back to loading the complete file.
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        dataset_dict = load_dataset(self.filetype, data_files=data_files)

    if self.training:
        # Sorting makes the label -> id assignment deterministic.
        labels = sorted(set(dataset_dict[stage][target]))
        # ``dataset`` defaults to ``None``; only record the class count when
        # a dataset object was actually supplied.
        if dataset is not None:
            dataset.num_classes = len(labels)
        self.set_state(LabelsState(labels))

    # The labels state may have been stored by this call (training) or by an
    # earlier one; when it is absent the target column is left untouched.
    labels_state = self.get_state(LabelsState)
    if labels_state is not None:
        label_to_class_mapping = {label: index for index, label in enumerate(labels_state.labels)}
        dataset_dict = dataset_dict.map(partial(self._transform_label, label_to_class_mapping, target))

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input_field), batched=True)

    # Hugging Face models expect target to be named ``labels``.
    # NOTE(review): ``rename_column_`` is the in-place variant; newer
    # ``datasets`` releases may only offer ``rename_column`` -- confirm
    # against the pinned version before changing.
    if not self.predicting and target != "labels":
        dataset_dict.rename_column_(target, "labels")

    dataset_dict.set_format("torch", columns=columns)

    return dataset_dict[stage]
def load_data(
    self,
    filepath: str,
    dataset: AutoDataset,
    columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
    use_full: bool = True,
):
    """Load one split from ``filepath``, tokenize it and encode its labels.

    Args:
        filepath: Path of the file handed to ``load_dataset``.
        dataset: Object providing ``running_stage``; it receives
            ``num_classes`` when not predicting.
        columns: Columns exposed in ``torch`` format on the result.
        use_full: When falsy, only the ``[:20]`` slice of the split is
            requested. The same happens when the ``FLASH_TESTING``
            environment variable is set to anything other than ``"0"``.

    Returns:
        The processed split keyed by the dataset's running stage.
    """
    split_name = dataset.running_stage.value
    files_by_split = {split_name: str(filepath)}

    # FLASH_TESTING is set in the CI to run faster.
    if not use_full or os.getenv("FLASH_TESTING", "0") != "0":
        # used for debugging. Avoid processing the entire dataset   # noqa E265
        sliced_splits = load_dataset(self.filetype, data_files=files_by_split, split=[f'{split_name}[:20]'])
        loaded = DatasetDict({split_name: sliced_splits[0]})
    else:
        loaded = load_dataset(self.filetype, data_files=files_by_split)

    # NOTE(review): tokenization is mapped both before and after the label
    # transform; the first pass looks redundant -- confirm before removing.
    loaded = loaded.map(self._tokenize_fn, batched=True)

    # convert labels to ids
    if not self.predicting:
        loaded = loaded.map(self._transform_label)

    loaded = loaded.map(self._tokenize_fn, batched=True)

    # Hugging Face models expect target to be named ``labels``.
    if not self.predicting and self.target != "labels":
        loaded.rename_column_(self.target, "labels")

    loaded.set_format("torch", columns=columns)

    if not self.predicting:
        dataset.num_classes = len(self.label_to_class_mapping)

    return loaded[split_name]