Beispiel #1
0
    def __init__(self, size, num_values, num_hashes=None):
        """Set up an empty Bloom filter backed by a bit array.

        @size - number of bits in the internal bit array; a larger
        array lowers the chance of a false positive

        @num_values - how many distinct values are expected to be
        inserted into / queried from the filter

        @num_hashes - optional number of hash functions; when left as
        None it is derived from @size and @num_values
        """
        # Start from an all-zero bit array of the requested length.
        bits = bitarray(size)
        bits.setall(False)

        self.size = size
        self.bitArr = bits
        self.hasher = murmur3_x64_128()

        if num_hashes is not None:
            self.numHashes = num_hashes
        else:
            # log(2) * size / num_values is the hash count that minimizes
            # the false-positive probability; it is truncated to an int
            # and never allowed to drop below 5.
            derived = int(log(2) * size / num_values)
            self.numHashes = max(5, derived)
Beispiel #2
0
    def __init__(self,
                 capacity,
                 error_rate=0.0001,
                 fname=None,
                 h1=pyhash.murmur3_x64_128(),
                 h2=pyhash.spooky_128()):
        """
        Initialise the filter parameters, the bit store and the hash helper.

        :param capacity: size of possible input elements
        :param error_rate: error rate handed to ``_adjust_param``
            (presumably the acceptable false-positive rate -- the original
            description was cut off, TODO confirm)
        :param fname: kept as ``self._fname``; not otherwise used here
        :param h1: first hash callable forwarded to ``hashes``
        :param h2: second hash callable forwarded to ``hashes``
        """
        # capacity and error_rate are stored before _adjust_param runs;
        # presumably it reads them from self -- verify before reordering.
        self.capacity = capacity
        self.error_rate = error_rate

        # calculate m (number of bits) & k (number of hashes)
        bit_count, hash_count = self._adjust_param(4096 * 8, error_rate)
        self.num_of_bits = bit_count
        self.num_of_hashes = hash_count

        self._fname = fname

        store = MmapBitSet(bit_count)
        self._data_store = store
        self._size = len(store)

        # Pre-bind both hash callables and the hash count.
        self._hashes = functools.partial(
            hashes, h1=h1, h2=h2, number=hash_count)
Beispiel #3
0
    def __init__(self, size, num_values, num_hashes=None):
        """Create an empty Bloom filter.

        The filter keeps a bit array of @size bits and is dimensioned
        for @num_values insertions.

        @size - length of the bit array in bits. The larger it is, the
        smaller the chance of a false positive

        @num_values - number of distinct values expected to be
        inserted/queried

        @num_hashes - number of hash functions (optional). When it is
        None the count is computed from @size and @num_values
        """
        self.size = size

        # All bits start cleared.
        empty_bits = bitarray(size)
        empty_bits.setall(False)
        self.bitArr = empty_bits

        self.hasher = murmur3_x64_128()

        # An explicit num_hashes wins; otherwise use the hash count that
        # minimizes the false-positive probability (log(2) * size /
        # num_values, truncated), with a lower bound of 5.
        self.numHashes = (
            num_hashes
            if num_hashes is not None
            else max(5, int(log(2) * size / num_values))
        )
Beispiel #4
0
def save_javascript_content(ldb_socket, logger, browser_params, msg):
    """ Send a decoded javascript response body to ldb_socket and return its hash

    The body is decompressed before it is sent and hashed, so that the same
    script hashes to the same value whatever Content-Encoding it came with.

    ldb_socket -- object whose ``send`` method receives the decoded script
    logger -- logger used to report bodies that cannot be decoded
    browser_params -- dict; ``save_javascript`` and ``crawl_id`` are read
    msg -- HTTP message pair; ``msg.request.url``, ``msg.response.headers``
        and ``msg.response.content`` are read

    Returns the script hash as a decimal string (the murmur3_x64_128 hash
    shifted right by 64 bits), or None when saving is disabled, the response
    is not javascript, or the body could not be decoded.
    """
    if not browser_params['save_javascript']:
        return

    # A response is javascript when its Content-Type says so or, failing
    # that, when the URL path ends in ".js".
    content_type = msg.response.headers['Content-Type']
    is_js = len(content_type) > 0 and 'javascript' in content_type[0]
    if not is_js:
        is_js = urlparse(msg.request.url).path.split('.')[-1] == 'js'
    if not is_js:
        return

    # Decompress any content with compression
    # We want files to hash to the same value
    # Firefox currently only accepts gzip/deflate
    content_encoding = msg.response.headers['Content-Encoding']
    if len(content_encoding) > 0:
        encoding = content_encoding[0].lower()
    else:
        encoding = ''

    # Header values that mean "the body is not compressed".
    uncompressed = ('', 'utf-8', 'identity', 'none', 'ansi_x3.4-1968', 'utf8')

    if encoding in uncompressed:
        script = msg.response.content
    elif 'gzip' in encoding:
        try:
            script = zlib.decompress(msg.response.content, zlib.MAX_WBITS | 16)
        except zlib.error as e:
            logger.error(
                'BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s'
                % (browser_params['crawl_id'], str(e)))
            return
    elif 'deflate' in encoding:
        try:
            # Raw DEFLATE stream (no zlib header), as many servers send it.
            script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS)
        except zlib.error as raw_error:
            # The HTTP "deflate" coding is formally the zlib format, so a
            # body that is not a raw stream may still be a valid
            # zlib-wrapped one; try that before giving up.
            try:
                script = zlib.decompress(msg.response.content, zlib.MAX_WBITS)
            except zlib.error:
                logger.error(
                    'BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s'
                    % (browser_params['crawl_id'], str(raw_error)))
                return
    else:
        logger.error(
            'BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive.'
            % (browser_params['crawl_id'], str(content_encoding)))
        return

    ldb_socket.send(script)

    # Hash script for deduplication on disk
    hasher = pyhash.murmur3_x64_128()
    script_hash = str(hasher(script) >> 64)

    return script_hash
Beispiel #5
0
def save_javascript_content(ldb_socket, logger, browser_params, msg):
    """ Forward a javascript response body over ldb_socket and return its hash

    Compressed bodies are decompressed first so that identical scripts hash
    to the same value regardless of their Content-Encoding.

    ldb_socket -- object whose ``send`` method is given the decoded script
    logger -- logger that receives an error for every undecodable body
    browser_params -- dict; the keys ``save_javascript`` and ``crawl_id``
        are read
    msg -- HTTP message pair; reads ``msg.request.url``,
        ``msg.response.headers`` and ``msg.response.content``

    Returns a decimal string (the murmur3_x64_128 hash of the script shifted
    right by 64 bits). Returns None if saving is switched off, the response
    is not javascript, or decoding failed.
    """
    if not browser_params['save_javascript']:
        return

    # Check if this response is javascript content: either the
    # Content-Type mentions it or the URL path has a "js" extension.
    type_header = msg.response.headers['Content-Type']
    looks_like_js = len(type_header) > 0 and 'javascript' in type_header[0]
    if not looks_like_js:
        extension = urlparse(msg.request.url).path.split('.')[-1]
        looks_like_js = extension == 'js'
    if not looks_like_js:
        return

    # Decompress any content with compression
    # We want files to hash to the same value
    # Firefox currently only accepts gzip/deflate
    content_encoding = msg.response.headers['Content-Encoding']
    # Normalise the header once; an absent header counts as "no encoding".
    declared = content_encoding[0].lower() if len(content_encoding) > 0 else ''

    if declared in ('', 'utf-8', 'identity', 'none', 'ansi_x3.4-1968', 'utf8'):
        # None of these values imply compression; use the body as is.
        script = msg.response.content
    elif 'gzip' in declared:
        try:
            script = zlib.decompress(msg.response.content, zlib.MAX_WBITS | 16)
        except zlib.error as e:
            logger.error(
                'BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s'
                % (browser_params['crawl_id'], str(e)))
            return
    elif 'deflate' in declared:
        try:
            # First attempt: headerless (raw) DEFLATE data.
            script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS)
        except zlib.error as raw_error:
            # "deflate" is specified as zlib-wrapped data, which the raw
            # decoder rejects, so retry with the zlib container format.
            try:
                script = zlib.decompress(msg.response.content, zlib.MAX_WBITS)
            except zlib.error:
                logger.error(
                    'BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s'
                    % (browser_params['crawl_id'], str(raw_error)))
                return
    else:
        logger.error(
            'BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive.'
            % (browser_params['crawl_id'], str(content_encoding)))
        return

    ldb_socket.send(script)

    # Hash script for deduplication on disk
    hasher = pyhash.murmur3_x64_128()
    script_hash = str(hasher(script) >> 64)

    return script_hash
Beispiel #6
0
def process_script(script, batch, db, counter, logger):
    """
    Queue a script in the batch unless db already holds it.

    The key is the murmur3_x64_128 hash of the script shifted right by
    64 bits, as a decimal string; the value is the zlib-compressed
    script. Returns counter + 1 when the script was queued, otherwise
    counter unchanged. logger is not used.
    """
    # Hash script for deduplication on disk
    digest = pyhash.murmur3_x64_128()(script)
    key = str(digest >> 64)

    already_stored = db.get(key) is not None
    if not already_stored:
        batch.put(key, zlib.compress(script))
        return counter + 1
    return counter
Beispiel #7
0
def process_script(script, batch, db, counter, logger):
    """
    adds a script to the batch unless db already contains its hash

    The key is the murmur3_x64_128 hash of the script shifted right by
    64 bits, as a decimal string; the stored value is the
    zlib-compressed script. logger is not used.

    Returns the updated count: counter + 1 when the script was added to
    the batch, counter unchanged when it was a duplicate. The count has
    to be returned because rebinding the int parameter inside this
    function is not visible to the caller.
    """
    # Hash script for deduplication on disk
    hasher = pyhash.murmur3_x64_128()
    script_hash = str(hasher(script) >> 64)

    if db.get(script_hash) is not None:
        return counter

    compressed_script = zlib.compress(script)

    batch.put(script_hash, compressed_script)
    return counter + 1
Beispiel #8
0
    def __init__(self, size=65536, k=7, name='bf', load=False):
        """Build a new filter, or restore one through ``self.load(name)``.

        size -- length of the bit array and of each per-hash table
        k -- number of tables; must satisfy 0 < k <= 18 (presumably
            because 18 hash functions are registered below -- confirm)
        name -- stored as ``self.name``, or passed to ``self.load``
        load -- when true, ``size`` and ``k`` are ignored here and
            ``self.load(name)`` is called instead
        """
        if not load:
            self.size = size
            if k <= 0 or k > 18:
                # NOTE(review): returning here leaves the instance with
                # only ``size`` set (no k, name, bitarray, tables, hashes).
                print('k should be > 0 & <= 18')
                return
            self.k = k
            self.name = name
            self.bitarray = bitarray.bitarray('0' * self.size)
            # One row of ``size`` empty sets per table.
            self.tables = [[set() for _ in range(self.size)]
                           for _ in range(self.k)]
        else:
            self.load(name)

        def digest_as_int(algorithm):
            # Wrap a hashlib constructor so that it maps a text string to
            # the integer value of its hex digest.
            return lambda text: int(
                algorithm(text.encode('utf-8')).hexdigest(), 16)

        pyhash_functions = [
            pyhash.fnv1_64(),
            pyhash.murmur2_x64_64a(),
            pyhash.murmur3_x64_128(),
            pyhash.lookup3(),
            pyhash.super_fast_hash(),
            pyhash.city_128(),
            pyhash.spooky_128(),
            pyhash.farm_128(),
            pyhash.metro_128(),
            pyhash.mum_64(),
            pyhash.t1_64(),
            pyhash.xx_64(),
        ]
        hashlib_functions = [
            digest_as_int(algorithm)
            for algorithm in (
                hashlib.md5,
                hashlib.sha1,
                hashlib.sha224,
                hashlib.sha256,
                hashlib.sha384,
                hashlib.sha512,
            )
        ]
        # 12 pyhash hashers followed by 6 hashlib-based ones.
        self.hashes = pyhash_functions + hashlib_functions
Beispiel #9
0
def save_javascript_content(ldb_socket, logger, browser_params, msg):
    """Send a decoded javascript response body to ldb_socket and return its hash.

    The body is decompressed before it is sent and hashed, so that a script
    hashes to the same value whichever Content-Encoding it was served with.

    ldb_socket -- object whose ``send`` method receives the decoded script
    logger -- logger used to report bodies that cannot be decoded
    browser_params -- dict; ``save_javascript`` and ``crawl_id`` are read
    msg -- HTTP message pair; ``msg.request.url``, ``msg.response.headers``
        and ``msg.response.content`` are read

    Returns the script hash as a decimal string (the murmur3_x64_128 hash
    shifted right by 64 bits), or None when saving is disabled, the response
    is not javascript, or the body could not be decoded.
    """
    if not browser_params["save_javascript"]:
        return

    # Check if this response is javascript content: the Content-Type has to
    # mention javascript or the URL path has to end in ".js".
    content_type = msg.response.headers["Content-Type"]
    is_js = len(content_type) > 0 and "javascript" in content_type[0]
    if not is_js:
        is_js = urlparse(msg.request.url).path.split(".")[-1] == "js"
    if not is_js:
        return

    # Decompress any content with compression
    # We want files to hash to the same value
    # Firefox currently only accepts gzip/deflate
    content_encoding = msg.response.headers["Content-Encoding"]
    if len(content_encoding) > 0:
        encoding = content_encoding[0].lower()
    else:
        encoding = ""

    # Header values that do not imply any compression.
    plain_encodings = ("", "utf-8", "identity", "none", "ansi_x3.4-1968", "utf8")

    if encoding in plain_encodings:
        script = msg.response.content
    elif "gzip" in encoding:
        try:
            script = zlib.decompress(msg.response.content, zlib.MAX_WBITS | 16)
        except zlib.error as e:
            logger.error(
                "BROWSER %i: Received zlib error when trying to decompress gzipped javascript: %s"
                % (browser_params["crawl_id"], str(e))
            )
            return
    elif "deflate" in encoding:
        try:
            # Raw DEFLATE stream without a zlib header.
            script = zlib.decompress(msg.response.content, -zlib.MAX_WBITS)
        except zlib.error as raw_error:
            # The "deflate" coding is formally zlib-wrapped data, which the
            # raw decoder rejects; retry with the zlib container format.
            try:
                script = zlib.decompress(msg.response.content, zlib.MAX_WBITS)
            except zlib.error:
                logger.error(
                    "BROWSER %i: Received zlib error when trying to decompress deflated javascript: %s"
                    % (browser_params["crawl_id"], str(raw_error))
                )
                return
    else:
        logger.error(
            "BROWSER %i: Received Content-Encoding %s. Not supported by Firefox, skipping archive."
            % (browser_params["crawl_id"], str(content_encoding))
        )
        return

    ldb_socket.send(script)

    # Hash script for deduplication on disk
    hasher = pyhash.murmur3_x64_128()
    script_hash = str(hasher(script) >> 64)

    return script_hash
Beispiel #10
0
from pyhash import murmur3_x64_128

# Module-level murmur3_x64_128 hasher instance, called as hasher(text).
hasher = murmur3_x64_128()

def single():
    """Print a CSV-style table of i and hasher(str(i)) % N for i in 0..N-1.

    N is fixed at 10000. The first line printed is the header 'i, h_i'.
    """
    N = 10000
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('i, h_i')
    for i in range(N):
        h = hasher(str(i)) % N
        print("{}, {}".format(i, h))

def family():
    """Print five derived hash values for each i in 0..N-1 as CSV-style rows.

    The 128-bit value hasher(str(i)) is split into its lower and upper
    64 bits, and the j-th value (j = 0..4) is (lower + j * upper) % N.
    N is fixed at 1000. Rows are 'i, h, j' after the header
    'i, h_i, class'.
    """
    N = 1000
    # Plain int literal instead of a Python-2-only long literal; the
    # mask value is identical on Python 2 and 3.
    low_mask = (1 << 64) - 1

    print('i, h_i, class')
    for i in range(N):
        h128 = hasher(str(i))
        h64l = h128 & low_mask
        h64u = h128 >> 64

        for j in range(5):
            h = (h64l + j * h64u) % N
            print("{}, {}, {}".format(i, h, j))

def family_pairwise():

    print 'h_i, h_j, class'
    for i in range(N):
        h128 = hasher(str(i))
        h64l = h128 & ((1L << 64) - 1)
        h64u = h128 >> 64