Example No. 1
0
 def test_hash_same_strings(self):
     """Hashing must depend on value, not object identity.

     Lists holding the very same string object and a list holding
     equal-but-distinct string objects (produced by the JSON parser)
     must all produce the same hash.
     """
     text = "abc"
     shared_a = [text, text]  # both elements are the same object
     shared_b = [text, text]
     distinct = json.loads(
         f'["{text}", "{text}"]')  # parser yields fresh string objects
     # Sanity-check the identity assumptions before hashing.
     for pair in (shared_a, shared_b):
         self.assertIs(pair[0], text)
         self.assertIs(pair[0], pair[1])
     self.assertIsNot(distinct[0], text)
     self.assertIsNot(distinct[0], distinct[1])
     digests = [Hasher.hash(obj) for obj in (shared_a, shared_b, distinct)]
     self.assertEqual(digests[0], digests[1])
     self.assertEqual(digests[0], digests[2])
def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
    """The hash of a DataFilesDict must be stable across rebuilds and key
    order, but must change when the resolved files or their metadata
    (remote ETag, local mtime) change."""
    patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
    reference = DataFilesDict.from_local_or_remote(patterns)
    reference_hash = Hasher.hash(reference)

    # Rebuilding from identical patterns yields an identical hash.
    assert reference_hash == Hasher.hash(DataFilesDict.from_local_or_remote(patterns))

    # Key insertion order must not affect the hash.
    reordered = DataFilesDict(sorted(reference.items(), reverse=True))
    assert reference_hash == Hasher.hash(reordered)

    # Resolving to a different set of files changes the hash.
    other = DataFilesDict.from_local_or_remote({"train": [_TEST_URL], "test": [_TEST_URL]})
    assert reference_hash != Hasher.hash(other)

    # A changed remote ETag invalidates the hash.
    with patch("datasets.data_files.request_etag") as mock_request_etag:
        mock_request_etag.return_value = "blabla"
        assert reference_hash != Hasher.hash(DataFilesDict.from_local_or_remote(patterns))

    # A changed local modification time invalidates the hash.
    with patch("datasets.data_files.os.path.getmtime") as mock_getmtime:
        mock_getmtime.return_value = 123
        assert reference_hash != Hasher.hash(DataFilesDict.from_local_or_remote(patterns))
def test_DataFilesDict_from_hf_repo_hashing(hub_dataset_info):
    # The hash of a DataFilesDict built from a Hub repo must be stable across
    # rebuilds and key order, and must depend on the resolved files and the
    # repo identity (id and revision sha) — not on the literal glob patterns.
    patterns = {"train": ["**/train.txt"], "test": ["**/test.txt"]}
    data_files1 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    # Rebuilding from identical inputs yields an identical hash.
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    # Key insertion order must not affect the hash.
    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    # Different glob patterns that resolve to the SAME files hash equally:
    # the hash is based on what was resolved, not on the pattern text.
    patterns2 = {"train": ["data/train.txt"], "test": ["data/test.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    # Patterns resolving to different files change the hash.
    patterns2 = {"train": ["data/train.txt"], "test": ["data/train.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    # A different repo id invalidates the hash.
    with patch.object(hub_dataset_info, "id", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    # A different revision sha invalidates the hash.
    with patch.object(hub_dataset_info, "sha", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
Example No. 4
0
def test_dependency_on_dill():
    """Regression test: hashing a lambda once raised
    AttributeError: module 'dill._dill' has no attribute 'stack'."""
    Hasher().update(lambda x: x)
Example No. 5
0
 def test_hash_unpicklable(self):
     """An object that cannot be pickled must raise rather than hash silently."""
     unpicklable = UnpicklableCallable(Foo("hello"))
     with self.assertRaises(pickle.PicklingError):
         Hasher.hash(unpicklable)
Example No. 6
0
 def test_hash_update(self):
     """Incremental hashing must be deterministic: feeding the same update
     sequence twice gives equal digests; a different sequence differs."""
     def digest_of(word):
         # Feed a string and a class instance built from it, in order.
         hasher = Hasher()
         hasher.update(word)
         hasher.update(Foo(word))
         return hasher.hexdigest()

     self.assertEqual(digest_of("hello"), digest_of("hello"))
     self.assertNotEqual(digest_of("hello"), digest_of("there"))
Example No. 7
0
 def test_hash_class_instance(self):
     """Instances with equal state hash alike; different state hashes differently."""
     first, second, third = (Hasher.hash(Foo(word)) for word in ("hello", "hello", "there"))
     self.assertEqual(first, second)
     self.assertNotEqual(first, third)
Example No. 8
0
 def test_hash_simple(self):
     """Equal strings hash equally; distinct strings hash differently."""
     self.assertEqual(Hasher.hash("hello"), Hasher.hash("hello"))
     self.assertNotEqual(Hasher.hash("hello"), Hasher.hash("there"))
Example No. 9
0
    def _create_fingerprint_for_instance_list(self, pipeline: "Pipeline") -> str:
        """Create a fingerprint for the instance list

        The fingerprint is based on:
        - the fingerprint of the previous dataset
        - the tokenizer config
        - the indexer config of the features
        - the biome__version__, allennlp__version__ and spacy__version__ just to be completely sure!

        Parameters
        ----------
        pipeline
            Pipeline with the tokenizer and indexer config of the features

        Returns
        -------
        fingerprint
            String of hexadecimal digits
        """
        hasher = Hasher()
        # Reaching into the dataset's private fingerprint ties this value
        # to the previous processing state ("necessary evil").
        hasher.update(self.dataset._fingerprint)  # necessary evil ...
        # vars() turns the tokenizer config object into a hashable dict.
        hasher.update(vars(pipeline.backbone.tokenizer.config))
        for feature in pipeline.config.features:
            hasher.update(feature.config["indexer"])
        # Library versions are folded in so that upgrading any of them
        # invalidates cached results built with the old versions.
        hasher.update(biome__version__)
        hasher.update(allennlp__version__)
        hasher.update(spacy__version__)

        return hasher.hexdigest()