Exemple #1
0
 def add(self, uuid):
     """Register ``uuid`` in the HyperLogLog.

     Falsy keys (``None``, the empty string, ...) are ignored. Otherwise the
     key is hashed with ``smhasher.murmur3_x86_64``; the low-order ``self.b``
     bits of the hash pick a register in ``self.M`` and the remaining bits
     are handed to ``self._get_rho`` together with ``self.bitcount_arr``.
     The register keeps the larger of its current value and that result.
     """
     if not uuid:
         return

     try:
         digest = smhasher.murmur3_x86_64(uuid)
     except UnicodeEncodeError:
         # Fallback when the key cannot be encoded by the hasher: characters
         # outside ASCII are dropped before hashing.
         digest = smhasher.murmur3_x86_64(uuid.encode('ascii', 'ignore'))

     # The low-order b bits of the hash select the register to update.
     register_mask = (1 << self.b) - 1
     register = digest & register_mask
     # What is left once those b bits are shifted out feeds _get_rho.
     remainder = digest >> self.b

     self.M[register] = max(
         self.M[register],
         self._get_rho(remainder, self.bitcount_arr),
     )
Exemple #2
0
 def add(self, uuid):
     """Register ``uuid`` in the HyperLogLog.

     Falsy keys (``None``, the empty string, ...) are ignored. Otherwise the
     key is hashed with ``smhasher.murmur3_x86_64``; the low-order ``self.b``
     bits of the hash pick a register in ``self.M`` and the remaining bits
     are handed to ``self._get_rho`` together with ``self.bitcount_arr``.
     The register keeps the larger of its current value and that result.
     """
     if not uuid:
         return

     try:
         digest = smhasher.murmur3_x86_64(uuid)
     except UnicodeEncodeError:
         # Fallback when the key cannot be encoded by the hasher: characters
         # outside ASCII are dropped before hashing.
         digest = smhasher.murmur3_x86_64(uuid.encode('ascii', 'ignore'))

     # The low-order b bits of the hash select the register to update.
     register_mask = (1 << self.b) - 1
     register = digest & register_mask
     # What is left once those b bits are shifted out feeds _get_rho.
     remainder = digest >> self.b

     self.M[register] = max(
         self.M[register],
         self._get_rho(remainder, self.bitcount_arr),
     )
 def _make_hashfuncs(key):
     """Return ``nbr_slices`` bit offsets for ``key``, each in ``[0, nbr_bits)``.

     ``unicode`` keys are encoded as UTF-8; anything else is converted with
     ``str``. The hashes are chained: the first call to
     ``smhasher.murmur3_x86_64`` uses seed 0 and every later call is seeded
     with the previous hash. ``nbr_slices`` and ``nbr_bits`` come from the
     enclosing scope.
     """
     key = key.encode('utf-8') if isinstance(key, unicode) else str(key)

     offsets = []
     seed = 0
     for _ in range(nbr_slices):
         # The hash just computed doubles as the seed of the next round.
         seed = smhasher.murmur3_x86_64(key, seed)
         offsets.append(seed % nbr_bits)
     return offsets
Exemple #4
0
 def _make_hashfuncs(key):
     """Return ``nbr_slices`` bit offsets for ``key``, each in ``[0, nbr_bits)``.

     ``unicode`` keys are encoded as UTF-8; anything else is converted with
     ``str``. The hashes are chained: the first call to
     ``smhasher.murmur3_x86_64`` uses seed 0 and every later call is seeded
     with the previous hash. ``nbr_slices`` and ``nbr_bits`` come from the
     enclosing scope.
     """
     key = key.encode('utf-8') if isinstance(key, unicode) else str(key)

     offsets = []
     seed = 0
     for _ in range(nbr_slices):
         # The hash just computed doubles as the seed of the next round.
         seed = smhasher.murmur3_x86_64(key, seed)
         offsets.append(seed % nbr_bits)
     return offsets
Exemple #5
0
	# Each input line is expected to look like "<doc_id>\t<content>".
	# NOTE(review): the two-name unpacking raises ValueError unless the line
	# contains exactly one tab, so the check below only catches empty fields.
	doc_id,content = line.split("\t")
	if( (not (doc_id)) or (not(content))):
		print "line not formated properly"	
		# NOTE(review): break leaves the enclosing loop (its header is not
		# visible in this fragment) instead of skipping just this line.
		break
	# Clean the text with test_re (defined elsewhere; per the original
	# comment it removes punctuation -- TODO confirm).
	content = test_re(content)
	# Split into a list of whitespace-separated words.
	content_arr = content.split()
	#print len(content_arr)
	shingle = ""
	# If the line has fewer than k words, emit the whole line as a single
	# shingle: the words joined with no separator.
	if( len(content_arr) < k):
		shingle = ''.join(content_arr)
		#s = "%s\t%s"%(doc_id,shingle)
		# Hash the shingle once; its bits are inspected one at a time below.
		shing_hash = smhasher.murmur3_x86_64(shingle)
		# Emit one record per bit position, j = 0..63.
		for j in xrange(64):
			# Test the j'th bit of the hash (testBit is defined elsewhere;
			# presumably non-zero when the bit is set -- TODO confirm).
			bit_num = testBit(shing_hash,j)
			# bit_out is +1 when bit_num is positive and -1 otherwise.
			if bit_num > 0:
				bit_out = 1
			else:
				bit_out = -1
			#print "bit num",j,"is =",bit_out
			# Record format: "<doc_id>,<j>\t<bit_out>", every field
			# rendered as a string.
			s = "%s,%s\t%s"%(doc_id,str(j),str(bit_out))
			print s
Exemple #6
0
def hash(tohash):
  '''Return the smhasher.murmur3_x86_64 hash of str(tohash).

  The argument is converted with str() before hashing, so any object
  with a string form is accepted.
  '''
  text = str(tohash)
  return smhasher.murmur3_x86_64(text)
Exemple #7
0
#!/usr/bin/env python

import sys

sys.path.append('.')

import smhasher
#import murmurhash 

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word, str(smhasher.murmur3_x86_64("hello")))
Exemple #8
0
 def get_hash64(self, str):
     """Return the ``smhasher.murmur3_x86_64`` hash of ``str``.

     The argument is passed to the hasher unchanged. Note that the
     parameter name shadows the built-in ``str`` inside this method.
     """
     digest = smhasher.murmur3_x86_64(str)
     return digest
Exemple #9
0
 def hash_bytes(self, s):
     """Return the ``smhasher.murmur3_x86_64`` hash of ``s``.

     The argument is passed to the hasher unchanged.
     """
     digest = smhasher.murmur3_x86_64(s)
     return digest
Exemple #10
0
def hash_str(s):
    """Return the ``smhasher.murmur3_x86_64`` hash of ``s``.

    The argument is passed to the hasher unchanged.
    """
    digest = smhasher.murmur3_x86_64(s)
    return digest
Exemple #11
0
def hash(tohash):
  '''Return the smhasher.murmur3_x86_64 hash of str(tohash).

  The argument is converted with str() before hashing, so any object
  with a string form is accepted.
  '''
  text = str(tohash)
  return smhasher.murmur3_x86_64(text)
Exemple #12
0
 def get_hash64(self, str):
     """Return the ``smhasher.murmur3_x86_64`` hash of ``str``.

     The argument is passed to the hasher unchanged. Note that the
     parameter name shadows the built-in ``str`` inside this method.
     """
     digest = smhasher.murmur3_x86_64(str)
     return digest