def deserialize_minhash(column):
    """
    Reads the binary minhash file belonging to a column and rebuilds the minhash from it.

    The file is looked up at
    ``$WORKING_DIRECTORY/results/minhashes/<table>.<column>.txt``. If no file exists at
    that path, ``serialize_min_hash`` is called with the column before the file is opened.

    @param column: mapping that provides the "table" and "column" keys used to build the file name
    @return: the object produced by ``LeanMinHash.deserialize`` for the file contents
    """
    minhash_path = f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/{column["table"]}.{column["column"]}.txt'

    already_serialized = os.path.isfile(minhash_path)
    if not already_serialized:
        # NOTE(review): presumably this writes the file opened below - confirm against serialize_min_hash
        serialize_min_hash([column])

    # The ".txt" file holds raw bytes, hence the binary mode.
    with open(minhash_path, 'rb') as stream:
        raw_bytes = stream.read()

    return LeanMinHash.deserialize(bytearray(raw_bytes))
def test_deserialize(self):
    """Serialize a LeanMinHash into a buffer and check the deserialized copy matches it."""
    source = MinHash(10, 1, hashfunc=fake_hash_func)
    source.update(123)
    original = LeanMinHash(source)

    # Buffer is sized by the object itself before serializing into it.
    buffer = bytearray(original.bytesize())
    original.serialize(buffer)

    # The object rebuilt from the bytes must carry the same seed and the
    # same hash values as the one that was serialized.
    restored = LeanMinHash.deserialize(buffer)
    self.assertEqual(restored.seed, original.seed)
    self.assertEqual(len(restored.hashvalues), len(original.hashvalues))

    value_pairs = zip(original.hashvalues, restored.hashvalues)
    self.assertTrue(
        all(actual == expected for expected, actual in value_pairs))
def test_deserialize_byteorder(self):
    """Round-trip a LeanMinHash through a buffer once for each byte-order character."""
    for order in ("@", "=", "<", ">", "!"):
        source = MinHash(10, 1, hashobj=FakeHash)
        source.update(123)
        original = LeanMinHash(source)

        # The same byte-order character is used for sizing, writing and reading.
        buffer = bytearray(original.bytesize(order))
        original.serialize(buffer, order)

        restored = LeanMinHash.deserialize(buffer, order)
        # NOTE(review): hashobj is assigned by hand here, presumably because it
        # is not restored by deserialize - confirm against LeanMinHash.
        restored.hashobj = FakeHash

        # Seed and hash values must survive the round trip unchanged.
        self.assertEqual(restored.seed, original.seed)
        self.assertEqual(len(restored.hashvalues), len(original.hashvalues))

        value_pairs = zip(original.hashvalues, restored.hashvalues)
        self.assertTrue(
            all(actual == expected for expected, actual in value_pairs))