def test_hash_same_strings(self):
    string = "abc"
    obj1 = [string, string]  # two strings have the same ids
    obj2 = [string, string]
    obj3 = json.loads(f'["{string}", "{string}"]')  # two strings have different ids
    self.assertIs(obj1[0], string)
    self.assertIs(obj1[0], obj1[1])
    self.assertIs(obj2[0], string)
    self.assertIs(obj2[0], obj2[1])
    self.assertIsNot(obj3[0], string)
    self.assertIsNot(obj3[0], obj3[1])

    hash1 = Hasher.hash(obj1)
    hash2 = Hasher.hash(obj2)
    hash3 = Hasher.hash(obj3)
    self.assertEqual(hash1, hash2)
    self.assertEqual(hash1, hash3)
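# A minimal sketch (assuming CPython pickle semantics) of why string identity
# matters above: plain pickle memoizes repeated objects by id, so two lists
# that differ only in the identity of their (equal) strings pickle to
# different bytes. Hasher.hash must be identity-insensitive, which
# test_hash_same_strings verifies; this companion test is hypothetical,
# added here for illustration only.
def test_pickle_is_identity_sensitive_sketch(self):
    string = "abc"
    same_id = [string, string]  # second element becomes a memo reference when pickled
    diff_id = json.loads(f'["{string}", "{string}"]')  # two distinct string objects
    self.assertNotEqual(pickle.dumps(same_id), pickle.dumps(diff_id))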
def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
    patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
    data_files1 = DataFilesDict.from_local_or_remote(patterns)
    data_files2 = DataFilesDict.from_local_or_remote(patterns)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    # the hash is invariant to the order of the splits
    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": [_TEST_URL], "test": [_TEST_URL]}
    data_files2 = DataFilesDict.from_local_or_remote(patterns2)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    # changing the ETag of the remote file changes the hash
    with patch("datasets.data_files.request_etag") as mock_request_etag:
        mock_request_etag.return_value = "blabla"
        data_files2 = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    # changing the mtime of the local file changes the hash
    with patch("datasets.data_files.os.path.getmtime") as mock_getmtime:
        mock_getmtime.return_value = 123
        data_files2 = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
def test_DataFilesDict_from_hf_repo_hashing(hub_dataset_info):
    patterns = {"train": ["**/train.txt"], "test": ["**/test.txt"]}
    data_files1 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    # the hash is invariant to the order of the splits
    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    # different patterns that resolve to the same files give the same hash
    patterns2 = {"train": ["data/train.txt"], "test": ["data/test.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": ["data/train.txt"], "test": ["data/train.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    # changing the repo id or the commit sha changes the hash
    with patch.object(hub_dataset_info, "id", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch.object(hub_dataset_info, "sha", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
def test_dependency_on_dill():
    # AttributeError: module 'dill._dill' has no attribute 'stack'
    hasher = Hasher()
    hasher.update(lambda x: x)
def test_hash_unpicklable(self):
    with self.assertRaises(pickle.PicklingError):
        Hasher.hash(UnpicklableCallable(Foo("hello")))
def test_hash_update(self):
    hasher = Hasher()
    for x in ["hello", Foo("hello")]:
        hasher.update(x)
    hash1 = hasher.hexdigest()

    hasher = Hasher()
    for x in ["hello", Foo("hello")]:
        hasher.update(x)
    hash2 = hasher.hexdigest()

    hasher = Hasher()
    for x in ["there", Foo("there")]:
        hasher.update(x)
    hash3 = hasher.hexdigest()

    self.assertEqual(hash1, hash2)
    self.assertNotEqual(hash1, hash3)
def test_hash_class_instance(self):
    hash1 = Hasher.hash(Foo("hello"))
    hash2 = Hasher.hash(Foo("hello"))
    hash3 = Hasher.hash(Foo("there"))
    self.assertEqual(hash1, hash2)
    self.assertNotEqual(hash1, hash3)
def test_hash_simple(self):
    hash1 = Hasher.hash("hello")
    hash2 = Hasher.hash("hello")
    hash3 = Hasher.hash("there")
    self.assertEqual(hash1, hash2)
    self.assertNotEqual(hash1, hash3)
def _create_fingerprint_for_instance_list(self, pipeline: "Pipeline") -> str:
    """Create a fingerprint for the instance list.

    The fingerprint is based on:
    - the fingerprint of the previous dataset
    - the tokenizer config
    - the indexer config of the features
    - the biome__version__, allennlp__version__ and spacy__version__, just to be completely sure!

    Parameters
    ----------
    pipeline
        Pipeline with the tokenizer and indexer config of the features

    Returns
    -------
    fingerprint
        String of hexadecimal digits
    """
    hasher = Hasher()
    hasher.update(self.dataset._fingerprint)  # necessary evil ...
    hasher.update(vars(pipeline.backbone.tokenizer.config))
    for feature in pipeline.config.features:
        hasher.update(feature.config["indexer"])
    hasher.update(biome__version__)
    hasher.update(allennlp__version__)
    hasher.update(spacy__version__)
    return hasher.hexdigest()
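# A minimal sketch of how this fingerprint could gate a cache lookup, so the
# instance list is only rebuilt when the dataset or any relevant config
# changes. The `instance_cache` dict and `self._build_instances` helper are
# hypothetical, named here for illustration only:
#
#     fingerprint = self._create_fingerprint_for_instance_list(pipeline)
#     if fingerprint not in instance_cache:
#         instance_cache[fingerprint] = self._build_instances(pipeline)
#     instances = instance_cache[fingerprint]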