def test_hash_update(self):
    # Hashing the same sequence of values twice must produce the same digest.
    hasher = Hasher()
    for x in ["hello", Foo("hello")]:
        hasher.update(x)
    hash1 = hasher.hexdigest()

    hasher = Hasher()
    for x in ["hello", Foo("hello")]:
        hasher.update(x)
    hash2 = hasher.hexdigest()

    # A different sequence of values must produce a different digest.
    hasher = Hasher()
    for x in ["there", Foo("there")]:
        hasher.update(x)
    hash3 = hasher.hexdigest()

    self.assertEqual(hash1, hash2)
    self.assertNotEqual(hash1, hash3)
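
# `Foo` is not defined in this excerpt. A minimal sketch of such a helper,
# assuming `Hasher.update` only needs a picklable object whose state depends
# on the wrapped value (this definition is an assumption, not the original):
class Foo:
    def __init__(self, foo):
        self.foo = foo  # wrapped value; equal state yields an equal hash
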
def _create_fingerprint_for_instance_list(self, pipeline: "Pipeline") -> str:
    """Create a fingerprint for the instance list.

    The fingerprint is based on:
    - the fingerprint of the previous dataset
    - the tokenizer config
    - the indexer config of the features
    - the biome__version__, allennlp__version__ and spacy__version__, just to be completely sure!

    Parameters
    ----------
    pipeline
        Pipeline with the tokenizer and indexer config of the features

    Returns
    -------
    fingerprint
        String of hexadecimal digits
    """
    hasher = Hasher()
    hasher.update(self.dataset._fingerprint)  # necessary evil ...
    hasher.update(vars(pipeline.backbone.tokenizer.config))
    for feature in pipeline.config.features:
        hasher.update(feature.config["indexer"])
    hasher.update(biome__version__)
    hasher.update(allennlp__version__)
    hasher.update(spacy__version__)
    return hasher.hexdigest()
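
# The helper above feeds every input that could invalidate the cached instance
# list through `Hasher`, so changing any one of them yields a new fingerprint.
# A minimal sketch of that property, assuming `Hasher` is the one from
# HuggingFace `datasets` (an assumption; the import is not shown in this
# excerpt, and all values below are hypothetical stand-ins):
from datasets.fingerprint import Hasher as _Hasher

def _demo_fingerprint_sensitivity():
    h1, h2, h3 = _Hasher(), _Hasher(), _Hasher()
    for h, tokenizer_config in (
        (h1, {"lowercase": True}),
        (h2, {"lowercase": True}),
        (h3, {"lowercase": False}),
    ):
        h.update("previous-dataset-fingerprint")  # stand-in for dataset._fingerprint
        h.update(tokenizer_config)                # stand-in for the tokenizer config
        h.update("1.0.0")                         # stand-in for a library version
    # Same inputs in the same order -> same digest.
    assert h1.hexdigest() == h2.hexdigest()
    # Changing a single component (here the tokenizer config) changes the digest.
    assert h1.hexdigest() != h3.hexdigest()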