def transform(self):
        """Hash every string-typed column of ``self.df`` into floats, in place.

        Each column whose dtype is ``object`` is replaced, value by value, with
        the ``pyhash.fnv1_64`` hash of the value's UTF-8 encoded string form,
        divided by 2**64. Columns of any other dtype are left untouched.

        A ``describe(include='all')`` summary of the frame is printed before
        and after the conversion, followed by the elapsed wall-clock time.
        """
        print("\t--Beginning:  Pandas dataframe transformation")

        # Capture start time.
        start_time = time.time()

        print("\nPre-transformation:\n")
        print(self.df.describe(include='all'))

        # Use FNV hash to transform string values into a numerical
        # representation.
        print("\tBeginning:  Hashing and scaling string values.")
        hash_alg = pyhash.fnv1_64()
        scale = 2**64  # divisor applied to every hash value

        # Examine the Pandas data frame column by column and hash/scale only
        # columns detected as strings.
        for column in self.df.columns:
            # Compare against the builtin ``object``: the ``np.object`` alias
            # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
            if self.df.dtypes[column] == object:
                print("\t\tHashing and scaling column: %s" % column)
                self.df[column] = self.df[column].map(
                    lambda a: hash_alg(str(a).encode('utf-8')) / scale)

        print("\tFinishd:  Hashing and scaling string values.")

        print("\n\nPost-transformation:")
        print(self.df.describe(include='all'))

        print("\t\tTransformation Time: %.4f seconds" %
              (time.time() - start_time))
        print("\t--Finished:  Pandas dataframe transformation")
Ejemplo n.º 2
0
def test_error_return_none():
    """Check that hashing ``None`` raises ``TypeError`` without leaking refs.

    The hasher is called with ``None`` 10000 times; every call must raise
    ``TypeError``. Afterwards the reference count of ``None`` must not be
    higher than it was before the loop. The whole check is skipped on
    interpreters that lack ``sys.getrefcount``.
    """
    if not hasattr(sys, 'getrefcount'):  # skip pypy
        return

    hasher = pyhash.fnv1_64()
    refs_before = sys.getrefcount(None)

    for _ in range(10000):
        try:
            hasher(None)
        except TypeError:
            # Expected outcome; go on to the next round.
            continue

        pytest.fail("fail to raise exception")

    refs_after = sys.getrefcount(None)

    assert refs_before >= refs_after
Ejemplo n.º 3
0
    def __init__(self):
        """Interactively size the two hashing levels and start the console UI.

        Prompts for an exponent ``t`` and sets ``N = 2**t``. The top level
        ``TL`` gets ``N`` buckets and the bottom level ``BL`` gets ``N // 2``
        buckets; every bucket is a fresh copy of four ``(0, 0)`` slots.

        If the input is not an integer, or is a negative integer, an error is
        printed and the constructor returns early: ``N`` stays 0, both levels
        stay empty and ``self.cui()`` is not called.
        """
        self.h1 = pyhash.fnv1_64()
        self.h2 = pyhash.metro_64()
        self.empty_bucket = [(0, 0)] * 4
        self.TL = []
        self.BL = []
        self.TMP = []
        self.N = 0

        print('Welcome to Level Hashing emulator!')
        t = input('Please configure Top Level size N = 2^')
        # Only the conversion can raise ValueError, so keep the try minimal.
        try:
            t = int(t)
        except ValueError:
            print('Error : the input is not a int')
            return

        if t < 0:
            # 2**t is a float for a negative exponent; range() below would
            # then raise TypeError right after a misleading success message.
            print('Error : the input must be a non-negative int')
            return

        print('Set N =', 2**t, ', initialize successfully')
        self.N = 2**t
        # Copy the template so buckets do not share one list object.
        self.TL = [self.empty_bucket.copy() for _ in range(self.N)]
        self.BL = [self.empty_bucket.copy() for _ in range(self.N // 2)]

        self.cui()
Ejemplo n.º 4
0
    def __init__(self, size=65536, k=7, name='bf', load=False):
        """Create a fresh filter, or restore one via ``self.load(name)``.

        Args:
            size: Number of bits in ``self.bitarray`` and number of sets in
                each table. Ignored when ``load`` is true.
            k: Number of tables; must satisfy ``0 < k <= 18``. The upper bound
                equals the number of hash callables placed in ``self.hashes``.
                Ignored when ``load`` is true.
            name: Stored as ``self.name``, or passed to ``self.load`` when
                ``load`` is true.
            load: When true, call ``self.load(name)`` instead of building
                empty state.

        When ``k`` is out of range a message is printed and the constructor
        returns early, leaving ``k``, ``name``, ``bitarray``, ``tables`` and
        ``hashes`` unset.
        """
        if not load:
            self.size = size
            if k <= 0 or k > 18:
                print('k should be > 0 & <= 18')
                return
            self.k = k
            self.name = name
            self.bitarray = bitarray.bitarray('0' * self.size)
            self.tables = [[set() for _slot in range(self.size)]
                           for _table in range(self.k)]
        else:
            self.load(name)

        def digest_as_int(algorithm):
            # Wrap a hashlib constructor so that it maps a text string to the
            # integer value of its hexadecimal digest.
            return lambda text: int(
                algorithm(text.encode('utf-8')).hexdigest(), 16)

        pyhash_hashers = [
            pyhash.fnv1_64(),
            pyhash.murmur2_x64_64a(),
            pyhash.murmur3_x64_128(),
            pyhash.lookup3(),
            pyhash.super_fast_hash(),
            pyhash.city_128(),
            pyhash.spooky_128(),
            pyhash.farm_128(),
            pyhash.metro_128(),
            pyhash.mum_64(),
            pyhash.t1_64(),
            pyhash.xx_64(),
        ]
        hashlib_hashers = [
            digest_as_int(algorithm)
            for algorithm in (
                hashlib.md5,
                hashlib.sha1,
                hashlib.sha224,
                hashlib.sha256,
                hashlib.sha384,
                hashlib.sha512,
            )
        ]
        # Order matters: the pyhash hashers come first, then the hashlib ones.
        self.hashes = pyhash_hashers + hashlib_hashers
Ejemplo n.º 5
0
            #     print ("%s  processado \n"%(feedbackCount*percentFeedback*100))

    # Results output...

    def logValue(file, valueName, value):
        """Write one ``name<TAB>value`` line, newline-terminated, to ``file``."""
        line = "\t".join((str(valueName), str(value))) + "\n"
        file.write(line)

    # Dump three lines per hash function: its index, its maximum tail and the
    # number associated with that tail.
    with open("outputs.txt", "w") as outputFile:
        for index, hashFunction in enumerate(hashFunctions):
            logValue(outputFile, "HashFunction", str(index))
            logValue(outputFile, "MaxTail", hashFunction.maximumTail)
            logValue(outputFile, "MaximumTailNumber",
                     hashFunction.maximumTailNumber)


# Module-level pyhash FNV-1 64-bit hasher instance.
# NOTE(review): in the visible code it is referenced only from a commented-out
# line in HashFunction.__init__ -- confirm whether it is still needed.
HASHER = pyhash.fnv1_64()


class HashFunction:
    def __init__(self, primeNumber, size):
        """Store the prime, reset the tail statistics and set up the hash.

        Args:
            primeNumber: Kept as ``self.primeNumber`` and forwarded to
                ``utils.universal_hash``.
            size: Forwarded to ``utils.universal_hash``; not stored on the
                instance here.
        """
        self.primeNumber = primeNumber
        # self.coefficient = coefficient
        # Initial statistics. NOTE(review): -1 / None presumably mean "no tail
        # recorded yet" -- confirm against the code that updates them.
        self.maximumTail = -1
        self.maximumTailNumber = None

        #a = np.random.randint(1, primeNumber)
        #self._hashFunction =lambda x: (a+HASHER(x.tobytes()))

        # NOTE(review): utils.universal_hash is not visible here; presumably it
        # returns a callable like the commented-out lambda alternatives --
        # verify.
        self._hashFunction = utils.universal_hash(primeNumber, size)
        #self._hashFunction = lambda x: (x)