def transform(self):
    """Transform the data contained in the raw data Pandas DataFrame into
    something a machine learning algorithm can use.

    String (object-dtype) columns are replaced in place by a 64-bit FNV-1
    hash of each value, scaled into [0, 1) by dividing by 2**64. Numeric
    columns are left untouched. Progress and describe() summaries are
    printed before and after the transformation.
    """
    print("\t--Beginning: Pandas dataframe transformation")
    # Capture start time.
    start_time = time.time()

    print("\nPre-transformation:\n")
    print(self.df.describe(include='all'))

    # Use FVN hash to transform string values into a numerical representation.
    print("\tBeginning: Hashing and scaling string values.")
    hash_alg = pyhash.fnv1_64()

    # Examine the Pandas data frame column by column and hash/scale only
    # columns detected as strings.
    for i in self.df.columns:
        # FIX: `np.object` was deprecated in NumPy 1.20 and removed in 1.24
        # (raises AttributeError there); the builtin `object` is the
        # equivalent, version-safe dtype comparison.
        if self.df.dtypes[i] == object:
            print("\t\tHashing and scaling column: %s" % i)
            # Hash each value's UTF-8 bytes, then scale into [0, 1).
            self.df[i] = self.df[i].map(
                lambda a: hash_alg(str(a).encode('utf-8')) / 2**64)
    # FIX: corrected "Finishd" typo in the log message.
    print("\tFinished: Hashing and scaling string values.")

    print("\n\nPost-transformation:")
    print(self.df.describe(include='all'))
    print("\t\tTransformation Time: %.4f seconds" %
          (time.time() - start_time))
    print("\t--Finished: Pandas dataframe transformation")
def test_error_return_none():
    """Hashing None must raise TypeError and must not leak references to None."""
    # sys.getrefcount only exists on CPython; skip the check on PyPy.
    if not hasattr(sys, 'getrefcount'):
        return
    h = pyhash.fnv1_64()
    refs_before = sys.getrefcount(None)
    for _ in range(10000):
        try:
            h(None)
        except TypeError:
            continue
        # Reaching here means no exception was raised — that is a failure.
        pytest.fail("fail to raise exception")
    refs_after = sys.getrefcount(None)
    # The refcount of None must not have grown across 10k failed calls.
    assert refs_before >= refs_after
def __init__(self):
    """Interactively configure the two-level hash table, then start the CUI.

    Prompts the user for an exponent t and sizes the top level at 2^t
    buckets and the bottom level at half that. Bails out (without starting
    the CUI) if the input is not an integer.
    """
    self.h1 = pyhash.fnv1_64()
    self.h2 = pyhash.metro_64()
    # Template bucket: four empty (key, value) slots.
    self.empty_bucket = [(0, 0)] * 4
    self.TL = []    # top-level buckets
    self.BL = []    # bottom-level buckets (half the size of TL)
    self.TMP = []
    self.N = 0
    print('Welcome to Level Hashing emulator!')
    raw = input('Please configure Top Level size N = 2^')
    try:
        exponent = int(raw)
    except ValueError:
        print('Error : the input is not a int')
        return
    size = 2 ** exponent
    print('Set N =', size, ', initialize successfully')
    self.N = size
    # .copy() gives each bucket its own list (tuples inside are immutable).
    self.TL = [self.empty_bucket.copy() for _ in range(size)]
    self.BL = [self.empty_bucket.copy() for _ in range(size // 2)]
    self.cui()
def __init__(self, size=65536, k=7, name='bf', load=False):
    """Initialize a Bloom-filter-like structure with k hash functions.

    Args:
        size: number of bits / per-hash table slots.
        k: how many of the 18 available hash functions to use (1..18).
        name: identifier used for persistence via self.load().
        load: when True, restore state from disk instead of initializing.
    """
    if load:
        self.load(name)
    else:
        self.size = size
        # Only 18 distinct hash functions are defined below, so k is capped.
        if k > 18 or k <= 0:
            print('k should be > 0 & <= 18')
            return None
        self.k = k
        self.name = name
        self.bitarray = bitarray.bitarray('0' * self.size)
        # One table of `size` sets per hash function.
        self.tables = [[set() for j in range(self.size)]
                       for i in range(self.k)]
        # FIX: the lambda parameters previously shadowed the builtin `str`;
        # renamed to `s` (behavior unchanged).
        self.hashes = [
            pyhash.fnv1_64(),
            pyhash.murmur2_x64_64a(),
            pyhash.murmur3_x64_128(),
            pyhash.lookup3(),
            pyhash.super_fast_hash(),
            pyhash.city_128(),
            pyhash.spooky_128(),
            pyhash.farm_128(),
            pyhash.metro_128(),
            pyhash.mum_64(),
            pyhash.t1_64(),
            pyhash.xx_64(),
            # hashlib digests converted from hex to int for uniformity.
            lambda s: int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16),
            lambda s: int(hashlib.sha1(s.encode('utf-8')).hexdigest(), 16),
            lambda s: int(hashlib.sha224(s.encode('utf-8')).hexdigest(), 16),
            lambda s: int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16),
            lambda s: int(hashlib.sha384(s.encode('utf-8')).hexdigest(), 16),
            lambda s: int(hashlib.sha512(s.encode('utf-8')).hexdigest(), 16),
        ]
# print ("%s processado \n"%(feedbackCount*percentFeedback*100)) # Results output... def logValue(file, valueName, value): file.write(str(valueName) + "\t" + str(value) + "\n") with open("outputs.txt", "w") as outputFile: for i in range(len(hashFunctions)): logValue(outputFile, "HashFunction", str(i)) logValue(outputFile, "MaxTail", hashFunctions[i].maximumTail) logValue(outputFile, "MaximumTailNumber", hashFunctions[i].maximumTailNumber) HASHER = pyhash.fnv1_64() class HashFunction: def __init__(self, primeNumber, size): self.primeNumber = primeNumber # self.coefficient = coefficient self.maximumTail = -1 self.maximumTailNumber = None #a = np.random.randint(1, primeNumber) #self._hashFunction =lambda x: (a+HASHER(x.tobytes())) self._hashFunction = utils.universal_hash(primeNumber, size) #self._hashFunction = lambda x: (x)