def check_load_dataset(self, dataset_name, configs, is_local=False, use_local_dummy_data=False):
    for config in configs:
        with tempfile.TemporaryDirectory() as processed_temp_dir, tempfile.TemporaryDirectory() as raw_temp_dir:
            # create config and dataset
            dataset_builder_cls = self.load_builder_class(dataset_name, is_local=is_local)
            name = config.name if config is not None else None
            dataset_builder = dataset_builder_cls(name=name, cache_dir=processed_temp_dir)

            # TODO: skip Beam datasets and datasets that lack dummy data for now
            if not dataset_builder.test_dummy_data:
                logger.info("Skipping tests for this dataset for now")
                return

            if config is not None:
                version = config.version
            else:
                version = dataset_builder.VERSION

            def check_if_url_is_valid(url):
                if is_remote_url(url) and "\\" in url:
                    raise ValueError(f"Bad remote url '{url}' since it contains a backslash")

            # create a mock download manager whose download_and_extract() returns dummy data instead of real data
            mock_dl_manager = MockDownloadManager(
                dataset_name=dataset_name,
                config=config,
                version=version,
                cache_dir=raw_temp_dir,
                use_local_dummy_data=use_local_dummy_data,
                download_callbacks=[check_if_url_is_valid],
            )

            # packaged datasets like csv, text, json or pandas require some data files
            if dataset_builder.__class__.__name__.lower() in _PACKAGED_DATASETS_MODULES:
                mock_dl_manager.download_dummy_data()
                path_to_dummy_data = mock_dl_manager.dummy_file
                dataset_builder.config.data_files = get_packaged_dataset_dummy_data_files(
                    dataset_builder.__class__.__name__.lower(), path_to_dummy_data
                )

            # mock sizes that match the dummy data instead of the actual dataset
            if dataset_builder.info is not None:
                # approximate upper bound of the order of magnitude of the dummy data files
                one_mega_byte = 2 << 19
                dataset_builder.info.size_in_bytes = 2 * one_mega_byte
                dataset_builder.info.download_size = one_mega_byte
                dataset_builder.info.dataset_size = one_mega_byte

            # generate examples from dummy data
            dataset_builder.download_and_prepare(
                dl_manager=mock_dl_manager,
                download_mode=GenerateMode.FORCE_REDOWNLOAD,
                ignore_verifications=True,
                try_from_hf_gcs=False,
            )

            # get dataset
            dataset = dataset_builder.as_dataset(ignore_verifications=True)

            # check that the dataset has exactly the expected splits and that none of them is empty
            self.parent.assertListEqual(sorted(dataset_builder.info.splits.keys()), sorted(dataset))
            for split in dataset_builder.info.splits.keys():
                self.parent.assertTrue(len(dataset[split]) > 0)
            del dataset
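
# Illustrative sketch, not part of the original file: the refactored version above delegates the
# per-format data_files handling to get_packaged_dataset_dummy_data_files(), replacing the explicit
# Csv/Json/Pandas/Text branches of the older version below. Assuming the helper only maps the
# lower-cased packaged module name to the dummy file names used in those branches, it could look
# roughly like this:
import os


def get_packaged_dataset_dummy_data_files(dataset_name, path_to_dummy_data):
    # extensions mirror the hard-coded branches of the older check_load_dataset below (assumption)
    extensions = {"csv": "csv", "json": "json", "pandas": "pkl", "text": "txt"}
    return {
        split: os.path.join(path_to_dummy_data, f"{split}.{extensions[dataset_name]}")
        for split in ("train", "test", "dev")
    }
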
def check_load_dataset(self, dataset_name, configs, is_local=False):
    # run the dummy-data check for every config of the dataset
    for config in configs:
        with tempfile.TemporaryDirectory() as processed_temp_dir, tempfile.TemporaryDirectory() as raw_temp_dir:
            # create config and dataset
            dataset_builder_cls = self.load_builder_class(dataset_name, is_local=is_local)
            name = config.name if config is not None else None
            dataset_builder = dataset_builder_cls(name=name, cache_dir=processed_temp_dir)

            # TODO: skip Beam datasets and datasets that lack dummy data for now
            if not dataset_builder.test_dummy_data:
                logger.info("Skipping tests for this dataset for now")
                return

            if config is not None:
                version = config.version
            else:
                version = dataset_builder.VERSION

            # create a mock download manager whose download_and_extract() returns dummy data instead of real data
            mock_dl_manager = MockDownloadManager(
                dataset_name=dataset_name,
                config=config,
                version=version,
                cache_dir=raw_temp_dir,
                is_local=is_local,
            )

            if dataset_builder.__class__.__name__ == "Csv":
                # needs slight adaptation for the csv dataset
                mock_dl_manager.download_dummy_data()
                path_to_dummy_data = mock_dl_manager.dummy_file
                dataset_builder.config.data_files = {
                    "train": os.path.join(path_to_dummy_data, "train.csv"),
                    "test": os.path.join(path_to_dummy_data, "test.csv"),
                    "dev": os.path.join(path_to_dummy_data, "dev.csv"),
                }
            elif dataset_builder.__class__.__name__ == "Json":
                # needs slight adaptation for the json dataset
                mock_dl_manager.download_dummy_data()
                path_to_dummy_data = mock_dl_manager.dummy_file
                dataset_builder.config.data_files = {
                    "train": os.path.join(path_to_dummy_data, "train.json"),
                    "test": os.path.join(path_to_dummy_data, "test.json"),
                    "dev": os.path.join(path_to_dummy_data, "dev.json"),
                }
            elif dataset_builder.__class__.__name__ == "Pandas":
                # needs slight adaptation for the pandas dataset
                mock_dl_manager.download_dummy_data()
                path_to_dummy_data = mock_dl_manager.dummy_file
                dataset_builder.config.data_files = {
                    "train": os.path.join(path_to_dummy_data, "train.pkl"),
                    "test": os.path.join(path_to_dummy_data, "test.pkl"),
                    "dev": os.path.join(path_to_dummy_data, "dev.pkl"),
                }
            elif dataset_builder.__class__.__name__ == "Text":
                # needs slight adaptation for the text dataset
                mock_dl_manager.download_dummy_data()
                path_to_dummy_data = mock_dl_manager.dummy_file
                dataset_builder.config.data_files = {
                    "train": os.path.join(path_to_dummy_data, "train.txt"),
                    "test": os.path.join(path_to_dummy_data, "test.txt"),
                    "dev": os.path.join(path_to_dummy_data, "dev.txt"),
                }

            # mock sizes that match the dummy data instead of the actual dataset
            if dataset_builder.info is not None:
                # approximate upper bound of the order of magnitude of the dummy data files
                one_mega_byte = 2 << 19
                dataset_builder.info.size_in_bytes = 2 * one_mega_byte
                dataset_builder.info.download_size = one_mega_byte
                dataset_builder.info.dataset_size = one_mega_byte

            # generate examples from dummy data
            dataset_builder.download_and_prepare(
                dl_manager=mock_dl_manager,
                download_mode=GenerateMode.FORCE_REDOWNLOAD,
                ignore_verifications=True,
                try_from_hf_gcs=False,
            )

            # get dataset
            dataset = dataset_builder.as_dataset()

            # check that no split of the loaded dataset is empty
            for split in dataset_builder.info.splits.keys():
                self.parent.assertTrue(len(dataset[split]) > 0)
            del dataset
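
# Illustrative usage sketch, not taken from the file above: the class name, the DatasetTester
# wrapper and its load_all_configs() helper, and the dataset name are assumptions for this example.
# It shows how a per-dataset test case would typically drive check_load_dataset() through a tester
# object whose `parent` is the unittest.TestCase (matching the self.parent.assert* calls above).
from unittest import TestCase


class SquadLocalDatasetTest(TestCase):
    dataset_name = "squad"  # hypothetical example dataset

    def setUp(self):
        # assumed tester class that exposes load_builder_class() and check_load_dataset()
        self.dataset_tester = DatasetTester(parent=self)

    def test_load_dataset_all_configs(self):
        # collect every config of the dataset, then run the dummy-data loading check on each of them
        configs = self.dataset_tester.load_all_configs(self.dataset_name, is_local=True)  # assumed helper
        self.dataset_tester.check_load_dataset(self.dataset_name, configs, is_local=True)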