def test_DataFilesDict_from_hf_repo_with_base_path(hub_dataset_info, pattern, size, base_path, split_name):
    """Check ``DataFilesDict.from_hf_repo`` when a ``base_path`` is supplied.

    When ``size`` is positive, the split named ``split_name`` must contain
    exactly ``size`` entries. Otherwise the call is expected to raise
    ``FileNotFoundError``.
    """
    # Built once: both branches resolve the same single-pattern split.
    patterns = {split_name: [pattern]}
    if size > 0:
        data_files = DataFilesDict.from_hf_repo(patterns, hub_dataset_info, base_path=base_path)
        assert len(data_files[split_name]) == size
    else:
        with pytest.raises(FileNotFoundError):
            # Only the raised exception matters here, so the result is not bound.
            DataFilesDict.from_hf_repo(patterns, hub_dataset_info, base_path=base_path)
def test_DataFilesDict_from_hf_repo(hub_dataset_info, hub_dataset_info_patterns_results, pattern):
    """Check ``DataFilesDict.from_hf_repo`` against the expected results for ``pattern``.

    On success, every value of the returned mapping must be a ``DataFilesList``,
    the sorted string forms of the "train" entries must equal
    ``hub_dataset_info_patterns_results[pattern]``, and every "train" entry must
    be a ``Url``. If the call raises ``FileNotFoundError`` instead, the expected
    results for ``pattern`` must be empty.
    """
    split_name = "train"
    try:
        # Keep the try body to the single call whose FileNotFoundError is meaningful,
        # so an error raised while checking the result is never mistaken for
        # "no file matched the pattern".
        data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info)
    except FileNotFoundError:
        # Nothing was resolved: acceptable only when no file is expected for this pattern.
        assert len(hub_dataset_info_patterns_results[pattern]) == 0
    else:
        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())
        assert sorted(str(f) for f in data_files[split_name]) == hub_dataset_info_patterns_results[pattern]
        assert all(isinstance(url, Url) for url in data_files[split_name])
def test_DataFilesDict_from_hf_repo_hashing(hub_dataset_info):
    """Check which changes do and do not alter the ``Hasher`` hash of a ``DataFilesDict``.

    The hash must be unchanged when the same patterns are resolved again, when
    the items are inserted in reverse-sorted order, and when the explicit
    ``data/...`` paths are used in place of the ``**/...`` globs. It must change
    when the "test" split points at ``data/train.txt``, and when the ``id`` or
    ``sha`` attribute of ``hub_dataset_info`` is patched.
    """
    glob_patterns = {"train": ["**/train.txt"], "test": ["**/test.txt"]}
    reference = DataFilesDict.from_hf_repo(glob_patterns, hub_dataset_info)

    # Resolving the same patterns a second time must hash identically.
    rebuilt = DataFilesDict.from_hf_repo(glob_patterns, hub_dataset_info)
    assert Hasher.hash(reference) == Hasher.hash(rebuilt)

    # Insertion order of the splits must not influence the hash.
    reordered = DataFilesDict(sorted(reference.items(), reverse=True))
    assert Hasher.hash(reference) == Hasher.hash(reordered)

    # Explicit paths are expected to hash the same as the glob patterns.
    explicit_patterns = {"train": ["data/train.txt"], "test": ["data/test.txt"]}
    from_explicit = DataFilesDict.from_hf_repo(explicit_patterns, hub_dataset_info)
    assert Hasher.hash(reference) == Hasher.hash(from_explicit)

    # Pointing the "test" split at the train file must change the hash.
    mismatched_patterns = {"train": ["data/train.txt"], "test": ["data/train.txt"]}
    from_mismatched = DataFilesDict.from_hf_repo(mismatched_patterns, hub_dataset_info)
    assert Hasher.hash(reference) != Hasher.hash(from_mismatched)

    # A different ``id`` on the dataset info must change the hash.
    with patch.object(hub_dataset_info, "id", "blabla"):
        with_other_id = DataFilesDict.from_hf_repo(glob_patterns, hub_dataset_info)
        assert Hasher.hash(reference) != Hasher.hash(with_other_id)

    # A different ``sha`` on the dataset info must change the hash.
    with patch.object(hub_dataset_info, "sha", "blabla"):
        with_other_sha = DataFilesDict.from_hf_repo(glob_patterns, hub_dataset_info)
        assert Hasher.hash(reference) != Hasher.hash(with_other_sha)