def generate_backoff_key(uri, extra_headers):
    return "backoff:" + binascii.b2a_base64(mmh3.hash_bytes(
        "/".join([uri, _get_arguments_key(extra_headers)])
    )).strip()


def _get_arguments_key(kwargs):
    return "|".join(
        [":".join([key, value]) for key, value in kwargs.iteritems()]
    )


def backoff_http_request(uri, decoderclass=BodyDecoder, extra_headers={},
                         backoff_key=None):
    if not backoff_key:
        backoff_key = generate_backoff_key(uri, extra_headers)
    request_url = str(uri)

    def handle_error(error, *args):
        error.raiseException()

    def handle_http_response(response):
        if response.code == 200:
            finished_d = defer.Deferred()
            response.deliverBody(decoderclass(finished_d))
            return finished_d
        else:
            # Cache the non-200 response so later requests can back off.
            pickled_non200_response = pickle.dumps(response)
            redis_d = Config.redis_client.set(
                backoff_key,
                pickled_non200_response,
                expire=non200_timeout_seconds,
                only_if_not_exists=True
            )
            raise HTTPRequestError(
                response.code,
                "got code %s talking to a remote http server: %s"
                % (str(response.code), response.phrase))

    http_request_d = Core.http_agent.request(
        'GET', request_url, Headers(extra_headers), None)
    http_request_d.addCallback(handle_http_response)
    http_request_d.addErrback(handle_error, request_url)
    # http_request_d.addErrback(log.err, uri)
    return http_request_d
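# A minimal, self-contained sketch of the key-derivation idea used in generate_backoff_key
# above: a stable "backoff:" cache key from a URI plus its extra headers. Sorting the header
# items is an added assumption here (the snippet above relies on dict iteration order), and
# make_backoff_key is an illustrative name, not part of the original module.
import base64
import mmh3


def make_backoff_key(uri, extra_headers):
    header_part = "|".join(":".join(kv) for kv in sorted(extra_headers.items()))
    digest = mmh3.hash_bytes("/".join([uri, header_part]))
    return "backoff:" + base64.b64encode(digest).decode("ascii")


# Example: the same URI and headers always map to the same cache key.
print(make_backoff_key("http://example.com/api", {"Accept": "application/json"}))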
def get_word_offset(buckets_file, buckets_cnt, buckets_offsets, word):
    hash_bytes = mmh3.hash_bytes(word.encode('utf-8'))
    bucket_id = abs(hash_bytes.__hash__()) % buckets_cnt
    bucket_offset = buckets_offsets[bucket_id]
    buckets_file.seek(bucket_offset)
    bucket_len = buckets_file.read(4)
    if bucket_len == '':
        return None
    int_decoder = struct.Struct('I')
    bucket_len = int_decoder.unpack(bucket_len)[0]
    data = buckets_file.read(__hash_entry_size * bucket_len)
    l = -1
    r = bucket_len
    while l < r - 1:
        m = (l + r) / 2
        entry_hash_bytes = data[m * __hash_entry_size:m * __hash_entry_size + __hash_bytes_len]
        if entry_hash_bytes < hash_bytes:
            l = m
        else:
            r = m
    if r == bucket_len:
        return None
    found_hash_bytes = data[r * __hash_entry_size:r * __hash_entry_size + __hash_bytes_len]
    if hash_bytes == found_hash_bytes:
        return int_decoder.unpack(
            data[r * __hash_entry_size + __hash_bytes_len:(r + 1) * __hash_entry_size])[0]
    return None
def compute_entity_id(entity_key: EntityKeyProto) -> str:
    """
    Compute Entity id given Feast Entity Key for online stores.

    Remember that Entity here refers to `EntityKeyProto` which is used in some online stores
    to encode the keys. It has nothing to do with the Entity concept we have in Feast.
    """
    return mmh3.hash_bytes(serialize_entity_key(entity_key)).hex()
def hash_object(o: Any) -> str:
    """Returns a 16-character hash code of arbitrary Python objects."""
    with io.BytesIO() as buffer:
        dill.dump(o, buffer)
        hash = mmh3.hash_bytes(buffer.getvalue(), x64arch=True)
        hash = base64.b32encode(hash).decode("UTF-8")
        return hash[:16].lower()
def save_dictionary(file_name, dictionary):
    words_cnt = len(dictionary)
    buckets_cnt = (words_cnt + __entries_per_bucket - 1) / __entries_per_bucket
    buckets = [[] for _ in xrange(buckets_cnt)]
    for word, offset in dictionary.items():
        hash_bytes = mmh3.hash_bytes(word.encode('utf-8'))
        bucket_id = abs(hash_bytes.__hash__()) % buckets_cnt
        buckets[bucket_id].append((hash_bytes, offset))
    bucket_offsets = dict()
    int_encoder = struct.Struct('I')
    with open(file_name, 'wb') as out:
        for id in xrange(buckets_cnt):
            bucket = buckets[id]
            bucket.sort(key=lambda b: b[0])
            byte_array = bytearray()
            byte_array.extend(int_encoder.pack(len(bucket)))
            for (hash_bytes, offset) in bucket:
                assert __hash_bytes_len == len(hash_bytes)
                byte_array.extend(hash_bytes)
                byte_array.extend(int_encoder.pack(offset))
            bucket_offsets[id] = out.tell()
            out.write(byte_array)
    return buckets_cnt, bucket_offsets
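# save_dictionary above and get_word_offset earlier form a writer/reader pair around a simple
# on-disk layout: each bucket is a uint32 entry count followed by fixed-size entries of a
# 16-byte murmur3 digest plus a uint32 offset. A self-contained sketch that packs and unpacks
# one such entry; the constant and function names here are illustrative and mirror (but do not
# reuse) the module's private __hash_bytes_len / __hash_entry_size.
import struct
import mmh3

HASH_BYTES_LEN = 16                   # mmh3.hash_bytes() always returns 16 bytes
HASH_ENTRY_SIZE = HASH_BYTES_LEN + 4  # digest + uint32 offset


def pack_entry(word, offset):
    return mmh3.hash_bytes(word.encode('utf-8')) + struct.pack('I', offset)


def unpack_entry(entry):
    return entry[:HASH_BYTES_LEN], struct.unpack('I', entry[HASH_BYTES_LEN:])[0]


entry = pack_entry('example', 12345)
assert len(entry) == HASH_ENTRY_SIZE
print(unpack_entry(entry))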
def test_64bit():
    if sys.maxsize < (1 << 32):
        # Skip this test under 32-bit environments
        return
    a = np.zeros(2**32, dtype=np.int8)
    assert mmh3.hash(a) == -1988950868
    assert mmh3.hash64(a) == (-6319308327427928234, -8156928649350215884)
    assert mmh3.hash128(a) == 189813591698865711411311444615608766294
    assert mmh3.hash_bytes(a) == b'V\x8f}\xad\x8eNM\xa84\x07FU\x9c\xc4\xcc\x8e'
def fingerprint(self, item):
    '''
    Takes a string and returns its fingerprint in bits.

    The length of the fingerprint is given by fingerprint_size.
    To calculate this fingerprint, we hash the string with MurmurHash3
    and truncate the hash.
    '''
    item_hash = mmh3.hash_bytes(item)
    return item_hash[:self.fingerprint_size]
def compute_datastore_entity_id(entity_key: EntityKeyProto) -> str:
    """
    Compute Datastore Entity id given Feast Entity Key.

    Remember that Datastore Entity is a concept from the Datastore data model
    that has nothing to do with the Entity concept we have in Feast.
    """
    return mmh3.hash_bytes(serialize_entity_key(entity_key)).hex()
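# Both Feast helpers above reduce to "hex of the 16-byte murmur3 digest of the serialized
# entity key", which always yields a 32-character id. A dependency-free illustration with a
# made-up stand-in for serialize_entity_key()'s output:
import mmh3

serialized_key = b"driver_id\x00\x00\x00\x03\xe9"  # hypothetical serialized bytes
entity_id = mmh3.hash_bytes(serialized_key).hex()
print(len(entity_id), entity_id)                   # 32 hex characters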
def index(self, item):
    '''
    Calculate the (first) index of an item in the filter.
    '''
    item_hash = mmh3.hash_bytes(item)
    # Because of this modular computation, it will be tricky to increase
    # the capacity of the filter directly
    return int(codecs.encode(item_hash, 'hex'), 16) % self.capacity
def seq_sim_hash(tokenized_string: Sequence[AnyStr],
                 token_weight: Callable[[AnyStr], float] = lambda token: 1) -> Tuple[AnyStr, AnyStr]:
    """ Returns 2x 16-byte hashes """
    length = len(tokenized_string)
    if length == 0:
        return ZERO_HASH, ZERO_HASH
    elif length == 1:
        hash_bytes = mmh3.hash_bytes(tokenized_string[0])
        return hash_bytes, hash_bytes
    elif length == 3:
        # Special case: with the common approach, the hash of the middle element is ignored:
        # the first element's hash has weight 2 and the middle element's hash has weight 1,
        # so the first element always dominates.
        # Thus, as an exception, give all elements the same weight.
        vector1 = [0] * 16 * 8
        vector2 = [0] * 16 * 8
        hash_bytes0 = mmh3.hash_bytes(tokenized_string[0])
        token_weight0 = token_weight(tokenized_string[0])
        hash_bytes1 = mmh3.hash_bytes(tokenized_string[1])
        token_weight1 = token_weight(tokenized_string[1])
        hash_bytes2 = mmh3.hash_bytes(tokenized_string[2])
        token_weight2 = token_weight(tokenized_string[2])
        add_hash_to_vector(hash_bytes0, vector1, token_weight0)
        add_hash_to_vector(hash_bytes1, vector1, token_weight1)
        add_hash_to_vector(hash_bytes1, vector2, token_weight1)
        add_hash_to_vector(hash_bytes2, vector2, token_weight2)
        return binarize_vector_to_hash(vector1), binarize_vector_to_hash(vector2)
    else:
        vector1 = [0] * 16 * 8
        vector2 = [0] * 16 * 8
        for i in range(length):
            token = tokenized_string[i]
            hash_bytes = mmh3.hash_bytes(token)
            weight = token_weight(token)
            count_after = length - 1 - i
            if count_after > 0:
                add_hash_to_vector(hash_bytes, vector1, count_after * weight)
            count_before = i
            if count_before > 0:
                add_hash_to_vector(hash_bytes, vector2, count_before * weight)
        return binarize_vector_to_hash(vector1), binarize_vector_to_hash(vector2)
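# A small, self-contained helper for consuming 16-byte hashes like the pair returned by
# seq_sim_hash above: the Hamming distance (number of differing bits) between two digests,
# which is the usual way sim-hashes are compared. hamming_distance is an illustrative name,
# not part of the snippet above.
import mmh3


def hamming_distance(h1, h2):
    return sum(bin(a ^ b).count('1') for a, b in zip(h1, h2))


a = mmh3.hash_bytes("hello world")
b = mmh3.hash_bytes("hello there")
print(hamming_distance(a, b))  # 0..128 differing bits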
def hash_kwargs(doc):
    """
    Create a hash from the values of a document.

    TODO maybe add the keys too?
    """
    acc = b''
    for k in sorted(doc.keys()):
        v = doc[k]
        acc += bytes_hasher[cls_finder(v)](v)
    hash_ = mmh3.hash_bytes(acc)
    return binascii.hexlify(hash_).decode()
def fingerprint(self, item):
    '''
    Take an item and return its fingerprint in bits.

    The fingerprint of an item is computed by truncating its MurmurHash3
    hash to the fingerprint size.

    Return a bit array representation of the fingerprint.
    '''
    mmh3_hash = bitarray()
    mmh3_hash.frombytes(mmh3.hash_bytes(item))
    # Only get up to the size of the fingerprint
    return mmh3_hash[:self.fingerprint_size]
def write_urls(batch_size):
    t = time.time()
    tp = turnip.Turnip()
    batch = []
    cnt = 0
    for url in sys.stdin:
        cnt = cnt + 1
        batch.append((mmh3.hash_bytes(url), url))
        if len(batch) >= batch_size:
            tp.write_batch(batch)
            batch = []
    tp.write_batch(batch)
    print 'write ', time.time() - t, cnt / (time.time() - t)
def obtain_index_from_hash(self, string_item):
    hash_value = mmh3.hash_bytes(string_item)
    # This is new for Python 3, i.e. how you go from
    # bytes/bits to int/index values
    index = int.from_bytes(hash_value, byteorder="big")
    # Modulo the obtained index by the filter capacity;
    # this helps to restrict indices to 0 - filter_capacity
    index = index % self.filter_capacity
    return index
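# A minimal, self-contained sketch tying together the two values the filter snippets above
# derive from a single murmur3 digest: a bucket index and a truncated fingerprint. The
# capacity and fingerprint size are illustrative constants, not the originals' settings.
import mmh3

CAPACITY = 1024          # hypothetical number of buckets
FINGERPRINT_BYTES = 2    # hypothetical fingerprint size


def index_and_fingerprint(item):
    digest = mmh3.hash_bytes(item)                    # 16-byte murmur3 digest
    index = int.from_bytes(digest, "big") % CAPACITY  # first bucket index
    fingerprint = digest[:FINGERPRINT_BYTES]          # truncated fingerprint
    return index, fingerprint


print(index_and_fingerprint("192.168.1.190"))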
def add_query_profiler_data(self,
                            query_without_params: str,
                            params: Union[list, str, None],
                            target_db: str,
                            query_execution_time_in_micros: int,
                            db_row_count: Optional[int]) -> None:
    """
    This function adds to the bucket in the last index of the list, if the profiler is on.
    """
    if not self._query_profiler_enabled:
        return

    start_time = time()
    if self._current_query_profiler_level.normalize_sql and params:
        sql_normalized = re.sub(RE_NORMALIZE_REPEATED_PARAMS_PERCENT, '%s', query_without_params)
    else:
        sql_normalized = query_without_params

    app_stack_trace, django_stack_trace = find_stack_trace(
        app_module_names_to_exclude=settings.DJANGO_QUERY_PROFILER_APP_MODULES_TO_EXCLUDE,
        django_module_names_to_include=(django_base_model.__name__,),
        max_depth=self._current_query_profiler_level.stack_trace_depth)

    # New query_signature & query_signature_statistics instances
    query_signature = QuerySignature(
        query_without_params=sql_normalized,
        app_stack_trace=app_stack_trace,
        django_stack_trace=django_stack_trace,
        target_db=target_db)
    query_signature_statistics = QuerySignatureStatistics(
        frequency=1,  # Number of sql calls would be 1, when we entered this block
        query_execution_time_in_micros=query_execution_time_in_micros,
        db_row_count=db_row_count)

    query_params_db_key = (query_without_params, params or '', target_db)
    query_params_db_key_hash = hexlify(mmh3.hash_bytes(str(query_params_db_key)))
    new_query_profiled_data = QueryProfiledData(
        query_signature_to_query_signature_statistics={
            query_signature: query_signature_statistics
        },
        _query_params_db_hash_counter=Counter({query_params_db_key_hash: 1}),
        time_spent_profiling_in_micros=int((time() - start_time) * 1000 * 1000))

    # Add to existing data and set it back
    existing_query_profiled_data: QueryProfiledData = self._query_profiled_data_list[-1]
    combined_query_profiled_data: QueryProfiledData = existing_query_profiled_data + new_query_profiled_data
    self._query_profiled_data_list[-1] = combined_query_profiled_data
def hash_to_64(value: str, count: int = 1) -> List[int]:
    """
    Gets a list of numbers between 0 and 63 for use in a filter vector.

    :param value: string to be hashed
    :param count: must be less than 6
    :return:
    """
    digest = mmh3.hash_bytes(value)
    results = []
    for i in range(0, count):
        bytes = digest[i * 2:i * 2 + 2]
        value = struct.unpack("h", bytes)
        results.append(value[0] % 64)
    return results
def find_duplicates(db, warc, options):
    for record in ArchiveIterator(warc):
        id_ = get_record_id(record)
        try:
            text = get_record_text_content(record)
        except ValueError as e:
            logging.error(e)
            continue
        text_hash = mmh3.hash_bytes(text)
        seen = db.get(text_hash, None)
        byte_id = id_.encode('utf-8')
        if seen is None:
            db[text_hash] = byte_id
        elif seen == byte_id:
            pass    # same record
        else:
            seen = seen.decode('utf-8')
            print(f'{id_}\t{seen}')
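# An in-memory version of the deduplication idea above, assuming plain strings instead of
# WARC records: remember the murmur3 digest of each text and report the index of the first
# occurrence when a repeat is seen. find_duplicate_texts is an illustrative helper.
import mmh3


def find_duplicate_texts(texts):
    seen = {}
    for i, text in enumerate(texts):
        digest = mmh3.hash_bytes(text)
        if digest in seen:
            print(f"{i}\t{seen[digest]}")
        else:
            seen[digest] = i


find_duplicate_texts(["same text", "other text", "same text"])  # prints "2\t0"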
def hashfileobject(f, sample_threshhold=SAMPLE_THRESHOLD, sample_size=SAMPLE_SIZE, hexdigest=False):
    # Get file size from the file object
    f.seek(0, os.SEEK_END)
    size = f.tell()
    f.seek(0, os.SEEK_SET)

    if size < sample_threshhold or sample_size < 1:
        data = f.read()
    else:
        data = f.read(sample_size)
        f.seek(size // 2)
        data += f.read(sample_size)
        f.seek(-sample_size, os.SEEK_END)
        data += f.read(sample_size)

    hash_tmp = mmh3.hash_bytes(data)
    hash_ = hash_tmp[7::-1] + hash_tmp[16:7:-1]
    enc_size = varint.encode(size)
    digest = enc_size + hash_[len(enc_size):]
    return binascii.hexlify(digest).decode() if hexdigest else digest
def hashfile(filename, sample_threshhold=SAMPLE_THRESHOLD, sample_size=SAMPLE_SIZE, hexdigest=False):
    size = os.path.getsize(filename)
    with open(filename, 'rb') as f:
        if size < sample_threshhold or sample_size < 1:
            data = f.read()
        else:
            data = f.read(sample_size)
            f.seek(size // 2)
            data += f.read(sample_size)
            f.seek(-sample_size, os.SEEK_END)
            data += f.read(sample_size)

    hash_tmp = mmh3.hash_bytes(data)
    hash_ = hash_tmp[7::-1] + hash_tmp[16:7:-1]
    enc_size = varint.encode(size)
    digest = enc_size + hash_[len(enc_size):]
    return binascii.hexlify(digest) if hexdigest else digest
def hashfileobject(f, sample_threshhold=SAMPLE_THRESHOLD, sample_size=SAMPLE_SIZE, hexdigest=False):
    # Get file size from the file object
    f.seek(0, os.SEEK_END)
    size = f.tell()
    f.seek(0, os.SEEK_SET)

    if size < sample_threshhold or sample_size < 1:
        data = f.read()
    else:
        data = f.read(sample_size)
        f.seek(size // 2)
        data += f.read(sample_size)
        f.seek(-sample_size, os.SEEK_END)
        data += f.read(sample_size)

    hash_tmp = mmh3.hash_bytes(data)
    hash_ = hash_tmp[7::-1] + hash_tmp[16:7:-1]
    enc_size = varint.encode(size)
    digest = enc_size + hash_[len(enc_size):]

    # Rewind so the caller can keep using the file object
    f.seek(0, os.SEEK_SET)
    return binascii.hexlify(digest).decode() if hexdigest else digest
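# A self-contained illustration of the sampling scheme the three file-hashing helpers above
# share: hash sample_size bytes from the start, middle, and end of the data instead of the
# whole payload. The constant and the helper name are illustrative, not the module's real
# SAMPLE_SIZE or API, and the varint size-prefix step is omitted.
import io
import os
import mmh3

SAMPLE_SIZE = 16 * 1024  # hypothetical sample size


def sampled_digest(f, size):
    f.seek(0)
    data = f.read(SAMPLE_SIZE)
    f.seek(size // 2)
    data += f.read(SAMPLE_SIZE)
    f.seek(-SAMPLE_SIZE, os.SEEK_END)
    data += f.read(SAMPLE_SIZE)
    return mmh3.hash_bytes(data)


payload = bytes(range(256)) * 4096  # ~1 MiB of synthetic data
print(sampled_digest(io.BytesIO(payload), len(payload)).hex())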
def hash128(content):
    return mmh3.hash_bytes(content)
def obfuscate(self, blob):
    return str(mmh3.hash_bytes(blob))
def generateIPV6Address():
    # Generate 128 random bits as a byte array
    return mmh3.hash_bytes('', incrementSeed())
def chunk_hash(data):
    return b16encode(mmh3.hash_bytes(data)).decode('ascii')
def chunk_uuid(data):
    return b16encode(mmh3.hash_bytes(uuid1().bytes)).decode('ascii')
def _hashDigest(self, key, val, iv):
    return mmh3.hash_bytes(str(val) + str(iv), key)
def hash_file(path):
    with open(path, "rb") as file:
        return mmh3.hash_bytes(file.read()).hex()
def fetch_artist_cover(self, artist_id):
    try:
        artist = get_database().query(Artist).filter_by(id=artist_id).one()
    except NoResultFound:
        return

    remotes_artist = None
    tries = 0

    # Try and sleep until we get the remotes_artist.
    while remotes_artist is None and tries < 8:
        remotes_artist = remotes.get_artist(artist)
        tries += 1
        if remotes_artist is None:
            # Exponential backoff
            time.sleep(tries ** 2)

    lastfm_artist = None
    if remotes_artist is not None:
        lastfm_artist = remotes_artist["lastfm"]

    if lastfm_artist is None or lastfm_artist["cover"] is None:
        google_images = google.get_artist_images(artist)
        if google_images is not None:
            urls = google_images
        else:
            return
    else:
        urls = [lastfm_artist["cover"]]

    cover = None
    for url in urls:
        cover, resize_cover, resize_cover_large, cover_ext = self.retrieve_and_resize(url)
        if cover is None:
            continue

    if cover is None:
        return

    track_dirs = set()
    for track in artist.tracks:
        for path in track.paths:
            track_dirs.add(os.path.dirname(path.path))

    for track_dir in track_dirs:
        if not os.path.exists(track_dir):
            os.makedirs(track_dir)
        cover_dest = os.path.join(track_dir, ("%s%s" % (artist.slug, cover_ext)).encode("utf8"))
        if not os.path.exists(cover_dest):
            with open(cover_dest, "wb") as file:
                file.write(cover)
        artist.cover_path = cover_dest

    import mmh3

    artist.cover = resize_cover
    artist.cover_large = resize_cover_large
    artist.cover_hash = base64.b64encode(mmh3.hash_bytes(artist.cover))

    try:
        get_database().commit()
    except StaleDataError:
        # Artist was removed, ignore
        get_database().rollback()
        return

    ws.emit_all("covers.artist.update", artist.id)
def _normalize_key(key):
    return base64.encodebytes(mmh3.hash_bytes(key)).strip()
def key(self):
    b = mmh3.hash_bytes(self.name)[:self.name_hash_size]
    return b + self.name
def add_string(self, value):
    """Adds a string to the record key byte array."""
    string_hash = mmh3.hash_bytes(value)
    self.buffer_value.append(bytearray(string_hash))
import random
import sys
from string import ascii_letters, digits, punctuation

try:
    import mmh3
except:
    raise ImportError("Run `pip install mmh3` to install mmh3 for test")

visible = ascii_letters + digits + punctuation + " \t"


def get_random_word():
    return ''.join(
        random.choice(visible) for i in range(random.randint(1, 20))).rstrip()


with open(sys.argv[1], "w") as f:
    py_32bit_out = open("python_32.out", "w")
    py_128bit_out = open("python_128.out", "w")
    for i in range(1000):
        word = get_random_word()
        f.write(word + "\n")
        py_32bit_out.write(str(mmh3.hash(word, i) & 0xffffffff) + "\n")
        py_128bit_out.write(mmh3.hash_bytes(word, i) + "\n")
def compute_hashes(self):
    """Compute the Murmur hash of the key."""
    self.routing_hash = mmh3.hash_bytes(self.buffer_value)
    self.hash_code = int(self.routing_hash ^ (self.routing_hash >> 32))
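# mmh3.hash_bytes() returns raw bytes, so the shift/XOR mixing above presupposes an integer
# hash value. A self-contained sketch of the same "fold upper and lower 32 bits" step using
# mmh3.hash64() to obtain an integer directly; treating the first 64-bit half as the routing
# hash is an assumption for illustration, not the original library's documented convention.
import mmh3


def routing_hash_and_code(buffer_value):
    routing_hash = mmh3.hash64(buffer_value)[0] & 0xFFFFFFFFFFFFFFFF  # force unsigned 64-bit
    hash_code = (routing_hash ^ (routing_hash >> 32)) & 0xFFFFFFFF
    return routing_hash, hash_code


print(routing_hash_and_code(b"example record key"))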
def test_hash_bytes():
    assert mmh3.hash_bytes("foo") == b"aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~"
def test_bucket(self):
    '''
    Adding and deleting items in a bucket.
    '''
    # By default, a bucket has the capacity of 4
    bucket = Bucket()

    cases = [
        {
            'item': '192.168.1.190',
            'transformer': lambda string: string,
            'action': bucket.insert,
            'expected': True,
            'full': False,
            'included': True,
        },
        {
            'item': '192.168.1.191',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.insert,
            'expected': True,
            'full': False,
            'included': True,
        },
        {
            'item': '192.168.1.192',
            'transformer': lambda string: string,
            'action': bucket.insert,
            'expected': True,
            'full': False,
            'included': True,
        },
        {
            'item': '192.168.1.193',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.insert,
            'expected': True,
            'full': True,
            'included': True,
        },
        {
            'item': '192.168.1.194',
            'transformer': lambda string: string,
            'action': bucket.insert,
            'expected': False,
            'full': True,
            'included': False,
        },
        {
            'item': '192.168.1.195',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.insert,
            'expected': False,
            'full': True,
            'included': False,
        },
        {
            'item': '192.168.1.195',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.delete,
            'expected': False,
            'full': True,
            'included': False,
        },
        {
            'item': '192.168.1.192',
            'transformer': lambda string: string,
            'action': bucket.delete,
            'expected': True,
            'full': False,
            'included': False,
        },
        {
            'item': '192.168.1.193',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.delete,
            'expected': True,
            'full': False,
            'included': False,
        },
        {
            'item': '192.168.1.193',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.insert,
            'expected': True,
            'full': False,
            'included': True,
        },
        # Add the same item again
        {
            'item': '192.168.1.193',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.insert,
            'expected': True,
            'full': True,
            'included': True,
        },
        # Remove a duplicated item
        {
            'item': '192.168.1.193',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.delete,
            'expected': True,
            'full': False,
            'included': True,
        },
        # Remove the last copy of the duplicated item
        {
            'item': '192.168.1.193',
            'transformer': lambda string: str(int(IPAddress(string))),
            'action': bucket.delete,
            'expected': True,
            'full': False,
            'included': False,
        },
    ]

    for case in cases:
        item = case['transformer'](case['item'])
        # Generate all the fingerprints
        fingerprint = bitarray()
        fingerprint.frombytes(mmh3.hash_bytes(item))

        self.assertEqual(case['action'](fingerprint), case['expected'],
                         'Save {0} into the bucket ok'.format(item))
        self.assertEqual(bucket.is_full(), case['full'],
                         'Bucket capacity is ok')
        # Make sure that all items are in the bucket
        self.assertEqual(bucket.contains(fingerprint), case['included'],
                         'Item {0} is in the bucket'.format(item))
        self.assertEqual(fingerprint in bucket, case['included'],
                         'Item {0} is in the bucket'.format(item))
def blockHash(block):
    hashBytes = mmh3.hash_bytes(block)
    return binascii.hexlify(hashBytes)
def hash(data):
    return binascii.hexlify(mmh3.hash_bytes(data)).decode('ascii')
def test_hash_bytes(self):
    h = mmh3.hash_bytes('foo')
    assert h == b'aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~'
def _hash_murmurhash(buf):
    """
    Produce a 16-byte hash of *buf* using MurmurHash.
    """
    return mmh3.hash_bytes(buf)
def generateByteArray(size):
    # Generate 128 random bits as a byte array
    ba = mmh3.hash_bytes('', incrementSeed())
    for i in xrange(size / 8):
        ba = ba + mmh3.hash_bytes('', incrementSeed())
    return bytearray(ba[0:size])
def obtain_fingerprint(self, string_item):
    hash_value = mmh3.hash_bytes(string_item)
    fingerprint = hash_value[:self.item_fingerprint_size]
    return fingerprint
def get_cover(self, type, slug, size="default"):
    if type not in ["album", "artist"]:
        raise ValueError("Invalid type %s supplied" % type)

    entity = None

    if type == "album":
        entity = library_dao.get_album_by_slug(slug)
        if entity is None:
            raise ValueError("Entity not found")
        remotes.update_album(entity)
        if entity.cover_path is None or not os.path.exists(entity.cover_path):
            try:
                cherrypy.engine.bgtask.put_unique(self.fetch_album_cover, 15, entity.id)
            except NonUniqueQueueError:
                pass
    elif type == "artist":
        entity = library_dao.get_artist_by_slug(slug)
        if entity is None:
            raise ValueError("Entity not found")
        remotes.update_artist(entity)
        if entity.cover_path is None or not os.path.exists(entity.cover_path):
            try:
                cherrypy.engine.bgtask.put_unique(self.fetch_artist_cover, 15, entity.id)
            except NonUniqueQueueError:
                pass

    if entity is None:
        raise ValueError("Entity not found")

    if entity.cover_path is not None:
        if entity.cover is None:
            cover_ext = os.path.splitext(entity.cover_path)[1].decode("utf8")

            temp_cover = self._mktemp(cover_ext).encode("utf8")
            temp_cover_large = self._mktemp(cover_ext).encode("utf8")

            cover = image_service.resize(
                entity.cover_path, temp_cover,
                Covers.DEFAULT_WIDTH, Covers.DEFAULT_HEIGHT, Covers.DEFAULT_GRAVITY
            )

            large_offset = self._get_image_offset(
                Covers.LARGE_WIDTH, Covers.LARGE_HEIGHT, Covers.LARGE_GRAVITY)

            cover_large = image_service.resize(
                entity.cover_path, temp_cover_large,
                Covers.LARGE_WIDTH, Covers.LARGE_HEIGHT, Covers.LARGE_GRAVITY,
                large_offset,
            )

            if cover and cover_large:
                import mmh3

                with open(temp_cover, "rb") as file:
                    entity.cover = file.read()
                    entity.cover_hash = base64.b64encode(mmh3.hash_bytes(entity.cover))

                with open(temp_cover_large, "rb") as file:
                    entity.cover_large = file.read()

                os.remove(temp_cover)
                os.remove(temp_cover_large)

                get_database().commit()

        return self.guess_mime(entity), entity.cover_large if size == "large" else entity.cover

    return None, None
def index_hash(self, item):
    '''Calculate the (first) index of an item in the filter.'''
    item_hash = mmh3.hash_bytes(item)
    index = int.from_bytes(item_hash, byteorder='big') % self.capacity
    return index