def __init__(self, size, num_values, num_hashes=None):
    """Simple Bloom filter backed by a fixed-size bit array.

    @size - number of bits in the bit array; the larger, the lower the
            false-positive probability
    @num_values - expected number of distinct values to insert/query
    @num_hashes - number of hash functions (optional); derived from
                  @size and @num_values when omitted
    """
    self.size = size
    # Start with every bit cleared.
    bits = bitarray(size)
    bits.setall(False)
    self.bitArr = bits
    self.hasher = murmur3_x64_128()
    if num_hashes is not None:
        self.numHashes = num_hashes
    else:
        # k = ln(2) * m / n minimizes the false-positive probability;
        # a floor of 5 hash functions is enforced regardless.
        self.numHashes = max(5, int(log(2) * size / num_values))
def __init__(self, capacity, error_rate=0.0001, fname=None, h1=pyhash.murmur3_x64_128(), h2=pyhash.spooky_128()): """ :param capacity: size of possible input elements :param error_rate: posi :param fname: :param h1: :param h2: """ # calculate m & k self.capacity = capacity self.error_rate = error_rate self.num_of_bits, self.num_of_hashes = self._adjust_param( 4096 * 8, error_rate) self._fname = fname self._data_store = MmapBitSet(self.num_of_bits) self._size = len(self._data_store) self._hashes = functools.partial(hashes, h1=h1, h2=h2, number=self.num_of_hashes)
def __init__(self, size, num_values, num_hashes=None):
    """Simple implementation of a Bloom filter.

    Keeps a bit array of @size bits and expects about @num_values
    distinct insertions.

    @size - bit-array width; more bits means fewer false positives
    @num_values - number of distinct values expected to be inserted or
                  queried
    @num_hashes - optional hash-function count; computed from @size and
                  @num_values if not given
    """
    self.size = size
    self.bitArr = bitarray(size)
    self.bitArr.setall(False)
    self.hasher = murmur3_x64_128()
    if num_hashes is None:
        # Optimal count is ln(2) * m / n, floored at 5 functions to
        # minimize the chance of false positives.
        num_hashes = max(5, int(log(2) * size / num_values))
    self.numHashes = num_hashes
def save_javascript_content(ldb_socket, logger, browser_params, msg):
    """ Save javascript files de-duplicated and compressed on disk """
    if not browser_params['save_javascript']:
        return

    # A response is treated as javascript when either the Content-Type
    # header says so, or the request URL's path ends in ".js".
    content_type = msg.response.headers['Content-Type']
    is_js = len(content_type) > 0 and 'javascript' in content_type[0]
    if not is_js:
        is_js = urlparse(msg.request.url).path.split('.')[-1] == 'js'
    if not is_js:
        return

    # Decompress any content with compression so identical files hash
    # to the same value. Firefox currently only accepts gzip/deflate.
    script = ''
    content_encoding = msg.response.headers['Content-Encoding']
    plain_encodings = ('utf-8', 'identity', 'none',
                       'ansi_x3.4-1968', 'utf8', '')
    if len(content_encoding) == 0 or content_encoding[0].lower() in plain_encodings:
        script = msg.response.content
    elif 'gzip' in content_encoding[0].lower():
        try:
            # wbits = MAX_WBITS | 16 selects gzip-wrapped streams.
            script = zlib.decompress(msg.response.content, zlib.MAX_WBITS|16)
        except zlib.error as e:
            logger.error('BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s' % (browser_params['crawl_id'],str(e)))
            return
    elif 'deflate' in content_encoding[0].lower():
        try:
            # Negative wbits selects a raw deflate stream (no header).
            script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS)
        except zlib.error as e:
            logger.error('BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s' % (browser_params['crawl_id'],str(e)))
            return
    else:
        logger.error('BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive.' % (browser_params['crawl_id'], str(content_encoding)))
        return

    ldb_socket.send(script)

    # Hash script for deduplication on disk; keep only the upper 64
    # bits of the 128-bit murmur3 digest.
    hasher = pyhash.murmur3_x64_128()
    script_hash = str(hasher(script) >> 64)
    return script_hash
def process_script(script, batch, db, counter, logger):
    """Add *script* to *batch* unless an identical script is stored.

    Deduplicates on a 64-bit key (upper half of the 128-bit murmur3
    digest). Returns the running count of scripts queued, incremented
    only when a new script was actually added.
    """
    # Hash script for deduplication on disk.
    script_hash = str(pyhash.murmur3_x64_128()(script) >> 64)
    if db.get(script_hash) is not None:
        # Duplicate: nothing queued, count unchanged.
        return counter
    batch.put(script_hash, zlib.compress(script))
    return counter + 1
def process_script(script, batch, db, counter, logger):
    """ adds a script to the batch

    Deduplicates by a 64-bit key (the upper half of the 128-bit murmur3
    digest of the script). If the key is already present in *db* the
    script is skipped; otherwise the zlib-compressed script is queued on
    *batch* under its key.

    Returns the updated running count of scripts queued. (The previous
    version incremented a local `counter` and implicitly returned None,
    so the caller never saw the updated count; it now returns
    `counter` unchanged on a duplicate and `counter + 1` on an insert,
    matching the sibling implementation of this function.)
    """
    # Hash script for deduplication on disk
    hasher = pyhash.murmur3_x64_128()
    script_hash = str(hasher(script) >> 64)
    if db.get(script_hash) is not None:
        # Already stored: leave the count unchanged.
        return counter
    compressed_script = zlib.compress(script)
    batch.put(script_hash, compressed_script)
    return counter + 1
def __init__(self, size=65536, k=7, name='bf', load=False):
    """Bloom-filter-like structure with up to 18 hash functions.

    :param size: number of bits / buckets per hash table
    :param k: number of hash functions to use (must be 1..18, the
        number of hashers available in ``self.hashes``)
    :param name: base name used when saving/loading state
    :param load: when True, restore saved state from *name* and ignore
        the other parameters
    :raises ValueError: if k is outside the supported range
    """
    if load:
        self.load(name)
        return
    self.size = size
    # Fail loudly instead of printing and returning a half-built object
    # (previously an invalid k left every attribute unset, causing
    # AttributeErrors on first use).
    if k > 18 or k <= 0:
        raise ValueError('k should be > 0 & <= 18')
    self.k = k
    self.name = name
    self.bitarray = bitarray.bitarray('0' * self.size)
    # One table of `size` buckets per hash function.
    self.tables = [[set() for j in range(self.size)]
                   for i in range(self.k)]
    # 12 pyhash functions plus 6 hashlib digests parsed as big ints.
    # Lambda parameters are named `s` to avoid shadowing builtin `str`.
    self.hashes = [
        pyhash.fnv1_64(),
        pyhash.murmur2_x64_64a(),
        pyhash.murmur3_x64_128(),
        pyhash.lookup3(),
        pyhash.super_fast_hash(),
        pyhash.city_128(),
        pyhash.spooky_128(),
        pyhash.farm_128(),
        pyhash.metro_128(),
        pyhash.mum_64(),
        pyhash.t1_64(),
        pyhash.xx_64(),
        lambda s: int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16),
        lambda s: int(hashlib.sha1(s.encode('utf-8')).hexdigest(), 16),
        lambda s: int(hashlib.sha224(s.encode('utf-8')).hexdigest(), 16),
        lambda s: int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16),
        lambda s: int(hashlib.sha384(s.encode('utf-8')).hexdigest(), 16),
        lambda s: int(hashlib.sha512(s.encode('utf-8')).hexdigest(), 16),
    ]
def save_javascript_content(ldb_socket, logger, browser_params, msg):
    """ Save javascript files de-duplicated and compressed on disk """
    if not browser_params["save_javascript"]:
        return

    # Check if this response is javascript content
    headers = msg.response.headers
    ctype = headers["Content-Type"]
    looks_like_js = len(ctype) > 0 and "javascript" in ctype[0]
    if not looks_like_js:
        looks_like_js = urlparse(msg.request.url).path.split(".")[-1] == "js"
    if not looks_like_js:
        return

    # Decompress any content with compression so identical files hash
    # to the same value; Firefox currently only accepts gzip/deflate.
    content_encoding = headers["Content-Encoding"]
    encoding = content_encoding[0].lower() if len(content_encoding) > 0 else ""

    script = ""
    if encoding in ("utf-8", "identity", "none", "ansi_x3.4-1968", "utf8", ""):
        script = msg.response.content
    elif "gzip" in encoding:
        try:
            # wbits = MAX_WBITS | 16: expect a gzip-wrapped stream.
            script = zlib.decompress(msg.response.content, zlib.MAX_WBITS | 16)
        except zlib.error as e:
            logger.error(
                "BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s"
                % (browser_params["crawl_id"], str(e))
            )
            return
    elif "deflate" in encoding:
        try:
            # Negative wbits: raw deflate stream without a header.
            script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS)
        except zlib.error as e:
            logger.error(
                "BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s"
                % (browser_params["crawl_id"], str(e))
            )
            return
    else:
        logger.error(
            "BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive."
            % (browser_params["crawl_id"], str(content_encoding))
        )
        return

    ldb_socket.send(script)

    # Hash script for deduplication on disk (upper 64 bits of the
    # 128-bit murmur3 digest).
    hasher = pyhash.murmur3_x64_128()
    return str(hasher(script) >> 64)
from pyhash import murmur3_x64_128

# NOTE(review): this is Python 2 code (print statements, `1L` long
# literal); it will not run under Python 3.

# Shared 128-bit murmur3 hasher used by all experiments below.
hasher = murmur3_x64_128()


def single():
    # Bucket N consecutive integer keys into N bins with a single hash
    # function and emit (key, bucket) rows as CSV on stdout.
    N = 10000
    print 'i, h_i'
    for i in range(N):
        h = hasher(str(i)) % N
        print "{}, {}".format(i, h)


def family():
    # Derive a family of 5 hash functions from one 128-bit digest via
    # the h_j = h_lower + j * h_upper construction, and emit
    # (key, bucket, function-index) rows as CSV on stdout.
    N = 1000
    print 'i, h_i, class'
    for i in range(N):
        h128 = hasher(str(i))
        h64l = h128 & ((1L << 64) - 1)  # lower 64 bits of the digest
        h64u = h128 >> 64               # upper 64 bits of the digest
        for j in range(5):
            h = (h64l + j*h64u) % N
            print "{}, {}, {}".format(i, h, j)


def family_pairwise():
    # NOTE(review): `N` is not defined in this scope (the functions
    # above bind it locally), so calling this raises NameError. The
    # body also appears truncated -- after splitting the digest nothing
    # is computed or printed. Confirm against the original file.
    print 'h_i, h_j, class'
    for i in range(N):
        h128 = hasher(str(i))
        h64l = h128 & ((1L << 64) - 1)
        h64u = h128 >> 64