def test_autodataset_smoke():
    num_samples = 20
    dt = range(num_samples)
    ds = DataSource()

    dset = AutoDataset(data=dt, data_source=ds, running_stage=RunningStage.TRAINING)
    assert dset is not None
    assert dset.running_stage == RunningStage.TRAINING

    # check on members
    assert dset.data == dt
    assert dset.data_source == ds

    # test set the running stage
    dset.running_stage = RunningStage.PREDICTING
    assert dset.running_stage == RunningStage.PREDICTING

    # check on methods
    assert dset.load_sample is not None
    assert dset.load_sample == ds.load_sample

    # check getters
    assert len(dset) == num_samples
    assert dset[0] == 0
    assert dset[9] == 9
    assert dset[11] == 11
def generate_dataset(
    self,
    data: Optional[DATA_TYPE],
    running_stage: RunningStage,
) -> Optional[Union[AutoDataset, IterableAutoDataset]]:
    is_none = data is None
    if isinstance(data, Sequence):
        is_none = data[0] is None

    if not is_none:
        from flash.data.data_pipeline import DataPipeline

        mock_dataset = typing.cast(AutoDataset, MockDataset())
        with CurrentRunningStageFuncContext(running_stage, "load_data", self):
            load_data: Callable[[DATA_TYPE, Optional[Any]], Any] = getattr(
                self,
                DataPipeline._resolve_function_hierarchy(
                    "load_data",
                    self,
                    running_stage,
                    DataSource,
                )
            )
            parameters = signature(load_data).parameters
            if len(parameters) > 1 and "dataset" in parameters:  # TODO: This was DATASET_KEY before
                data = load_data(data, mock_dataset)
            else:
                data = load_data(data)

        if has_len(data):
            dataset = AutoDataset(data, self, running_stage)
        else:
            dataset = IterableAutoDataset(data, self, running_stage)

        dataset.__dict__.update(mock_dataset.metadata)
        return dataset
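# A minimal usage sketch for ``generate_dataset`` above, not taken from the library
# itself. ``ToyDataSource`` is a hypothetical subclass, and the import paths are
# assumed to match the version these snippets come from. Any attribute that
# ``load_data`` sets on its ``dataset`` argument is captured by the ``MockDataset``
# and copied onto the generated dataset through ``mock_dataset.metadata``.
from flash.data.auto_dataset import AutoDataset
from flash.data.data_source import DataSource
from pytorch_lightning.trainer.states import RunningStage


class ToyDataSource(DataSource):

    def load_data(self, data, dataset):
        # Attributes set here end up on the real AutoDataset / IterableAutoDataset.
        dataset.num_classes = 10
        return list(data)


train_dataset = ToyDataSource().generate_dataset(range(100), RunningStage.TRAINING)
assert isinstance(train_dataset, AutoDataset)  # ``range`` has a length, so map-style
assert train_dataset.num_classes == 10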
def autogenerate_dataset(
    cls,
    data: Any,
    running_stage: RunningStage,
    whole_data_load_fn: Optional[Callable] = None,
    per_sample_load_fn: Optional[Callable] = None,
    data_pipeline: Optional[DataPipeline] = None,
) -> AutoDataset:
    """Generate an ``AutoDataset`` from the ``DataPipeline`` if one is provided, otherwise
    directly from the given ``whole_data_load_fn`` and ``per_sample_load_fn`` functions."""
    if whole_data_load_fn is None:
        whole_data_load_fn = getattr(
            cls.preprocess_cls,
            DataPipeline._resolve_function_hierarchy('load_data', cls.preprocess_cls, running_stage, Preprocess)
        )

    if per_sample_load_fn is None:
        per_sample_load_fn = getattr(
            cls.preprocess_cls,
            DataPipeline._resolve_function_hierarchy('load_sample', cls.preprocess_cls, running_stage, Preprocess)
        )
    return AutoDataset(data, whole_data_load_fn, per_sample_load_fn, data_pipeline, running_stage=running_stage)
def test_autodataset_with_functions(
    with_dataset: bool,
    with_running_stage: bool,
):
    functions = _AutoDatasetTestPreprocess(with_dataset)

    load_sample_func = functions.load_sample
    load_data_func = functions.load_data

    if with_running_stage:
        running_stage = RunningStage.TRAINING
    else:
        running_stage = None
    dset = AutoDataset(
        range(10),
        load_data=load_data_func,
        load_sample=load_sample_func,
        running_stage=running_stage,
    )
    assert len(dset) == 10

    for idx in range(len(dset)):
        dset[idx]

    if with_dataset:
        assert dset.load_sample_was_called
        assert dset.load_data_was_called
        assert functions.load_sample_with_dataset_count == len(dset)
        assert functions.load_data_with_dataset_count == 1
    else:
        assert functions.load_data_count == 1
        assert functions.load_sample_count == len(dset)
def _generate_auto_dataset(
    self,
    data: Union[Iterable, Any],
    running_stage: Optional[RunningStage] = None,
) -> AutoDataset:
    return AutoDataset(data=data, data_pipeline=self, running_stage=running_stage)
def load_data(self, metadata: Any, dataset: AutoDataset) -> CustomCOCODataset:
    # Extract the folder, the COCO annotation file and the transform to be applied to the images
    folder, ann_file, transform = metadata
    ds = CustomCOCODataset(folder, ann_file, transform)
    if self.training:
        dataset.num_classes = ds.num_classes
        ds = _coco_remove_images_without_annotations(ds)
    return ds
def _generate_auto_dataset(
    self,
    data: Union[Iterable, Any],
    running_stage: Optional[RunningStage] = None,
    use_iterable_auto_dataset: bool = False,
) -> Union[AutoDataset, IterableAutoDataset]:
    if use_iterable_auto_dataset:
        return IterableAutoDataset(data, data_pipeline=self, running_stage=running_stage)
    return AutoDataset(data=data, data_pipeline=self, running_stage=running_stage)
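# Why the ``use_iterable_auto_dataset`` flag exists (plain-Python illustration, no
# flash objects involved): generators and other streaming sources have no
# ``__len__`` or ``__getitem__``, so they cannot back a map-style ``AutoDataset``
# and callers request an ``IterableAutoDataset`` instead.
def sample_stream():
    for i in range(10):
        yield i


stream = sample_stream()
assert not hasattr(stream, "__len__")      # no length -> iterable dataset territory
assert not hasattr(stream, "__getitem__")  # no random access either
assert hasattr([0, 1, 2], "__getitem__")   # a list suits the map-style AutoDataset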
def test_autodataset_warning():
    with pytest.warns(
        UserWarning,
        match="``datapipeline`` is specified but load_sample and/or load_data are also specified",
    ):
        AutoDataset(range(10), load_data=lambda x: x, load_sample=lambda x: x, data_pipeline=DataPipeline())
def common_load_data(self, df: DataFrame, dataset: AutoDataset):
    # impute the data
    # compute train dataset stats
    dfs = _pre_transform(
        [df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col, self.target_codes
    )

    df = dfs[0]

    dataset.num_samples = len(df)
    cat_vars = _to_cat_vars_numpy(df, self.cat_cols)
    num_vars = _to_num_vars_numpy(df, self.num_cols)

    cat_vars = np.stack(cat_vars, 1) if len(cat_vars) else np.zeros((len(df), 0))
    num_vars = np.stack(num_vars, 1) if len(num_vars) else np.zeros((len(df), 0))
    return df, cat_vars, num_vars
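# A self-contained illustration of the ``np.stack`` step above: per-column arrays
# are stacked along axis 1 into a (num_samples, num_columns) matrix, and when no
# columns of a kind exist, a (num_samples, 0) placeholder keeps axis-1 indexing valid.
import numpy as np

columns = [np.array([1, 2, 3]), np.array([4, 5, 6])]  # two encoded columns
stacked = np.stack(columns, 1)
assert stacked.shape == (3, 2)

empty = np.zeros((3, 0))  # no columns of this kind in the DataFrame
assert empty.shape == (3, 0)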
def generate_dataset(
    self,
    data: Optional[DATA_TYPE],
    running_stage: RunningStage,
) -> Optional[Union[AutoDataset, IterableAutoDataset]]:
    """Generate a single dataset with the given input to
    :meth:`~flash.data.data_source.DataSource.load_data` for the given ``running_stage``.

    Args:
        data: The input to :meth:`~flash.data.data_source.DataSource.load_data` to use to create the dataset.
        running_stage: The running_stage for this dataset.

    Returns:
        The constructed :class:`~flash.data.auto_dataset.BaseAutoDataset`.
    """
    is_none = data is None
    if isinstance(data, Sequence):
        is_none = data[0] is None

    if not is_none:
        from flash.data.data_pipeline import DataPipeline

        mock_dataset = typing.cast(AutoDataset, MockDataset())
        with CurrentRunningStageFuncContext(running_stage, "load_data", self):
            load_data: Callable[[DATA_TYPE, Optional[Any]], Any] = getattr(
                self,
                DataPipeline._resolve_function_hierarchy(
                    "load_data",
                    self,
                    running_stage,
                    DataSource,
                )
            )
            parameters = signature(load_data).parameters
            if len(parameters) > 1 and "dataset" in parameters:  # TODO: This was DATASET_KEY before
                data = load_data(data, mock_dataset)
            else:
                data = load_data(data)

        if has_len(data):
            dataset = AutoDataset(data, self, running_stage)
        else:
            dataset = IterableAutoDataset(data, self, running_stage)

        dataset.__dict__.update(mock_dataset.metadata)
        return dataset
def load_data(
    self,
    filepath: str,
    dataset: AutoDataset,
    columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
    use_full: bool = True,
):
    data_files = {}
    stage = dataset.running_stage.value
    data_files[stage] = str(filepath)

    # FLASH_TESTING is set in the CI to run faster.
    if use_full and os.getenv("FLASH_TESTING", "0") == "0":
        dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        # used for debugging. Avoid processing the entire dataset   # noqa E265
        dataset_dict = DatasetDict({
            stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
        })

    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

    # convert labels to ids
    if not self.predicting:
        dataset_dict = dataset_dict.map(self._transform_label)

    # Hugging Face models expect the target column to be named ``labels``.
    if not self.predicting and self.target != "labels":
        dataset_dict.rename_column_(self.target, "labels")

    dataset_dict.set_format("torch", columns=columns)

    if not self.predicting:
        dataset.num_classes = len(self.label_to_class_mapping)
    return dataset_dict[stage]
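# A hedged sketch of the debugging branch above: Hugging Face ``datasets`` split
# slicing loads only the first 20 rows of a stage, which keeps CI runs fast.
# ``train.csv`` is a placeholder path, not a file shipped with the library.
from datasets import DatasetDict, load_dataset

data_files = {"train": "train.csv"}
small = load_dataset("csv", data_files=data_files, split=["train[:20]"])[0]
dataset_dict = DatasetDict({"train": small})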
def load_data(self, data: Tuple[ND, ND], dataset: AutoDataset) -> List[Tuple[ND, float]]:
    if self.training:
        dataset.num_inputs = data[0].shape[1]
    return [(x, y) for x, y in zip(*data)]
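# A minimal sketch of what this regression ``load_data`` produces and how it is
# consumed sample by sample. ``ND`` is assumed to alias ``np.ndarray``.
import numpy as np

x = np.random.rand(5, 3)  # 5 samples, 3 input features
y = np.random.rand(5)
samples = [(xi, yi) for xi, yi in zip(x, y)]
assert len(samples) == 5
assert samples[0][0].shape == (3,)  # where ``dataset.num_inputs == 3`` comes from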