Beispiel #1
0
 def test_hash_same_strings(self):
     """Lists of equal strings must hash alike whether or not the strings share identity."""
     string = "abc"
     shared_a = [string, string]  # both entries are the very same object
     shared_b = [string, string]
     # Parsed from JSON text; the identity assertions below check that the
     # resulting strings are distinct objects.
     parsed = json.loads(f'["{string}", "{string}"]')

     # Sanity-check the identity assumptions before comparing hashes.
     for shared in (shared_a, shared_b):
         self.assertIs(shared[0], string)
         self.assertIs(shared[0], shared[1])
     self.assertIsNot(parsed[0], string)
     self.assertIsNot(parsed[0], parsed[1])

     digest_a = Hasher.hash(shared_a)
     digest_b = Hasher.hash(shared_b)
     digest_parsed = Hasher.hash(parsed)
     self.assertEqual(digest_a, digest_b)
     self.assertEqual(digest_a, digest_parsed)
def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
    """Check which inputs keep or change the hash of a local/remote DataFilesDict."""
    patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
    reference = DataFilesDict.from_local_or_remote(patterns)

    # Resolving the same patterns a second time gives the same hash.
    candidate = DataFilesDict.from_local_or_remote(patterns)
    assert Hasher.hash(reference) == Hasher.hash(candidate)

    # Rebuilding from the items in reverse-sorted order gives the same hash.
    candidate = DataFilesDict(sorted(reference.items(), reverse=True))
    assert Hasher.hash(reference) == Hasher.hash(candidate)

    # Pointing the "test" split at the URL instead of the local file changes the hash.
    url_only_patterns = {"train": [_TEST_URL], "test": [_TEST_URL]}
    candidate = DataFilesDict.from_local_or_remote(url_only_patterns)
    assert Hasher.hash(reference) != Hasher.hash(candidate)

    # With request_etag patched to return "blabla", the hash is expected to differ.
    with patch("datasets.data_files.request_etag", return_value="blabla"):
        candidate = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(reference) != Hasher.hash(candidate)

    # With os.path.getmtime patched to return 123, the hash is expected to differ.
    with patch("datasets.data_files.os.path.getmtime", return_value=123):
        candidate = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(reference) != Hasher.hash(candidate)
def test_DataFilesDict_from_hf_repo_hashing(hub_dataset_info):
    """Check which inputs keep or change the hash of a DataFilesDict built from a hub repo."""
    patterns = {"train": ["**/train.txt"], "test": ["**/test.txt"]}
    reference = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)

    # Resolving the same patterns a second time gives the same hash.
    candidate = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    assert Hasher.hash(reference) == Hasher.hash(candidate)

    # Rebuilding from the items in reverse-sorted order gives the same hash.
    candidate = DataFilesDict(sorted(reference.items(), reverse=True))
    assert Hasher.hash(reference) == Hasher.hash(candidate)

    # Explicit "data/..." paths are expected to hash like the glob patterns.
    explicit_patterns = {"train": ["data/train.txt"], "test": ["data/test.txt"]}
    candidate = DataFilesDict.from_hf_repo(explicit_patterns, hub_dataset_info)
    assert Hasher.hash(reference) == Hasher.hash(candidate)

    # Using the train file for the "test" split as well changes the hash.
    duplicated_patterns = {"train": ["data/train.txt"], "test": ["data/train.txt"]}
    candidate = DataFilesDict.from_hf_repo(duplicated_patterns, hub_dataset_info)
    assert Hasher.hash(reference) != Hasher.hash(candidate)

    # Overriding either the "id" or the "sha" of the dataset info changes the hash.
    for attribute in ("id", "sha"):
        with patch.object(hub_dataset_info, attribute, "blabla"):
            candidate = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
            assert Hasher.hash(reference) != Hasher.hash(candidate)
Beispiel #4
0
 def test_hash_unpicklable(self):
     """Hashing an UnpicklableCallable is expected to raise pickle.PicklingError."""
     expected_error = pickle.PicklingError
     with self.assertRaises(expected_error):
         # Built inside the context so the same set of errors is caught.
         target = UnpicklableCallable(Foo("hello"))
         Hasher.hash(target)
Beispiel #5
0
 def test_hash_class_instance(self):
     """Foo instances built from equal arguments hash alike; different arguments do not."""
     first = Hasher.hash(Foo("hello"))
     repeat = Hasher.hash(Foo("hello"))
     other = Hasher.hash(Foo("there"))

     self.assertEqual(first, repeat)
     self.assertNotEqual(first, other)
Beispiel #6
0
 def test_hash_simple(self):
     """Equal strings hash alike; different strings hash differently."""
     first = Hasher.hash("hello")
     repeat = Hasher.hash("hello")
     other = Hasher.hash("there")

     self.assertEqual(first, repeat)
     self.assertNotEqual(first, other)