def load_builder_class(self, dataset_name, is_local=False):
    """Resolve and return the builder class for ``dataset_name``.

    Args:
        dataset_name: Name of the dataset script to load.
        is_local: When True, resolve the script from the local
            ``./datasets/`` directory; otherwise download it, forcing a
            re-download so the latest remote script is used.

    Returns:
        The dataset builder class exported by the dataset script.
    """
    # Download/copy dataset script
    if is_local:  # PEP 8: don't compare booleans with `is True`
        module_path, _ = prepare_module("./datasets/" + dataset_name)
    else:
        module_path, _ = prepare_module(
            dataset_name, download_config=DownloadConfig(force_download=True)
        )
    # Get dataset builder class
    return import_main_class(module_path)
def test_load_real_dataset(self, dataset_name):
    """End-to-end load of a local dataset script using its first config.

    Loads the dataset into a throwaway cache directory and checks that
    every produced split contains at least one example.
    """
    path = "./datasets/" + dataset_name
    # `_` replaces an unused local that shadowed the builtin `hash`.
    module_path, _ = prepare_module(
        path, download_config=DownloadConfig(local_files_only=True), dataset=True
    )
    builder_cls = import_main_class(module_path, dataset=True)
    # First declared config, or None when the script declares no configs.
    name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None
    with tempfile.TemporaryDirectory() as temp_cache_dir:
        dataset = load_dataset(
            path, name=name, cache_dir=temp_cache_dir, download_mode=GenerateMode.FORCE_REDOWNLOAD
        )
        for split in dataset:  # a DatasetDict iterates its split names
            # assertGreater gives a clearer failure message than assertTrue(len > 0)
            self.assertGreater(len(dataset[split]), 0)
        # Drop the dataset before the temp cache dir is removed (open file handles).
        del dataset
def test_load_real_dataset_all_configs(self, dataset_name):
    """End-to-end load of every declared config of a dataset.

    Each config is loaded into its own throwaway cache directory and
    every produced split is checked to be non-empty.
    """
    path = dataset_name
    # `_` replaces an unused local that shadowed the builtin `hash`.
    module_path, _ = prepare_module(
        path, download_config=DownloadConfig(force_download=True), dataset=True
    )
    builder_cls = import_main_class(module_path, dataset=True)
    # Fall back to a single unnamed config when the script declares none.
    config_names = [config.name for config in builder_cls.BUILDER_CONFIGS] or [None]
    for name in config_names:
        with tempfile.TemporaryDirectory() as temp_cache_dir:
            dataset = load_dataset(
                path, name=name, cache_dir=temp_cache_dir, download_mode=GenerateMode.FORCE_REDOWNLOAD
            )
            for split in dataset:  # a DatasetDict iterates its split names
                # assertGreater gives a clearer failure message than assertTrue(len > 0)
                self.assertGreater(len(dataset[split]), 0)
            # Drop the dataset before the temp cache dir is removed (open file handles).
            del dataset
def get_builder(path, name, data_dir=None, cache_dir=None):
    """Instantiate (but do not prepare) the dataset builder for ``path``.

    Args:
        path: Path or name of the dataset script.
        name: Config name to instantiate, or None for the default config.
        data_dir: Optional directory containing the dataset's source data.
        cache_dir: Optional cache directory for the builder.

    Returns:
        A builder instance of the class exported by the dataset script.
    """
    # `module_hash` avoids shadowing the builtin `hash`; the resolved file
    # path is not needed here.
    module_path, module_hash, _resolved_file_path = prepare_module(
        path,
        dataset=True,
        return_resolved_file_path=True,
    )
    builder_cls = import_main_class(module_path, dataset=True)
    return builder_cls(
        cache_dir=cache_dir,
        name=name,
        data_dir=data_dir,
        hash=module_hash,  # keyword name `hash` is part of the builder API
    )