Example #1
def decode_lz4_old_kafka(buff):
    """Decode buff for 0.8/0.9 brokers

    Reference impl: https://github.com/dpkp/kafka-python/blob/a00f9ead161e8b05ac953b460950e42fa0e0b7d6/kafka/codec.py#L258
    """
    assert xxhash is not None
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(buff[4], int):
        flg = buff[4]
    else:
        flg = ord(buff[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(buff[4:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_buff = b''.join([
        buff[0:header_size-1],
        hc,
        buff[header_size:]
    ])
    return decode_lz4(munged_buff)
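For context, a minimal sketch of where the digest()[-2:-1] slice comes from: the LZ4 frame format defines the header-checksum byte (HC) as the second byte of xxh32 over the frame descriptor, and digest() returns big-endian bytes, so [-2:-1] extracts (h >> 8) & 0xFF. The descriptor bytes below are illustrative, not taken from the example above.

import xxhash

descriptor = bytes([0x64, 0x70])  # illustrative FLG/BD pair
h = xxhash.xxh32(descriptor)
hc = h.digest()[-2:-1]  # digest() is big-endian, so this is byte (h >> 8) & 0xFF
assert hc[0] == (h.intdigest() >> 8) & 0xFF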
Example #2
def list_n_neighbours(g_seq, l_seq, blocks=1000, leaf_size=200, nn=10):
    g_aln = [x for x in g_seq.values()]
    l_aln = [x for x in l_seq.values()]
    genome_size = len(g_aln[0].seq)  ## only works for aligned sequences
    block_size = int(genome_size / blocks)
    if (block_size < 1): block_size = 1
    logger.info("Creating a hashed genome with blocks of %s bases",
                str(block_size))
    g_hash = [[
        xxhash.xxh32(str(g_aln[j].seq[i:i + block_size])).intdigest()
        for i in range(0, genome_size, block_size)
    ] for j in range(len(g_aln))]
    btre = BallTree(
        np.array(g_hash), leaf_size=leaf_size,
        metric='hamming')  # create a neighbours tree of global sequences

    logger.info("And finding %s closest neighbours", str(nn))
    if (nn < 2):
        logger.warning(
            "Closest neighbour will be itself, if already on BallTree; useful only on two independent sets"
        )

    l_hash = [[
        xxhash.xxh32(str(l_aln[j].seq[i:i + block_size])).intdigest()
        for i in range(0, genome_size, block_size)
    ] for j in range(len(l_aln))]
    dist, idx = btre.query(l_hash, k=nn,
                           return_distance=True)  # return_distance is free
    clusters = list(set([g_aln[j].id for x in idx for j in x
                         ]))  # one-dimensional list of all global neighbours
    del g_aln, g_hash, l_aln, l_hash, btre, idx
    return clusters
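The per-block hashing above turns each aligned sequence into a fixed-length vector of integers, so that Hamming distance between vectors counts differing blocks. A tiny standalone illustration of the idea, using plain strings instead of alignment records:

import xxhash

def block_vector(s, block=5):
    # hash consecutive fixed-size blocks of a sequence
    return [xxhash.xxh32(s[i:i + block]).intdigest() for i in range(0, len(s), block)]

v1 = block_vector('ACGTACGTAC')
v2 = block_vector('ACGTACGTTT')
print(sum(a != b for a, b in zip(v1, v2)))  # 1 differing block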
Example #3
def hashLocation(authTicket, latitude, longitude, altitude):
    baseHash = xxhash.xxh32(authTicket.SerializeToString(), seed=0x1B845238).intdigest()
    locationBytes = d2h(latitude) + d2h(longitude) + d2h(altitude)

    # Using serialized Auth Ticket
    hashA = xxhash.xxh32(locationBytes, seed=baseHash).intdigest()

    # Hash of location using static seed 0x1B845238
    hashB = xxhash.xxh32(locationBytes, seed=0x1B845238).intdigest()
    return hashA, hashB
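The d2h helper used above is not shown on this page. A hedged sketch of what such a double-to-bytes helper typically looks like (the byte order and exact encoding are assumptions; the project's real version may differ):

import struct

def d2h(value):
    # hypothetical reconstruction: pack a number as an 8-byte IEEE-754 double
    return struct.pack('<d', float(value))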
Example #4
    def bb_hash(self):
        node_hash = xxhash.xxh32()
        relationship_hash = xxhash.xxh32()

        for disasm_text in self.bb.disassembly_text:
            if 'sub_' not in str(disasm_text):
                node_hash.update(str(disasm_text))

        relationship_hash.update(
            str(self.parent_func_uuid) + str(self.parent_bb) + str(self.UUID))

        return node_hash.intdigest(), relationship_hash.intdigest()
Example #5
def hashLocation(authTicket, latitude, longitude, altitude):
    baseHash = xxhash.xxh32(authTicket.SerializeToString(),
                            seed=0x1B845238).intdigest()

    # Format location
    locationBytes = d2h(latitude) + d2h(longitude) + d2h(altitude)

    # Using serialized Auth Ticket
    hashA = xxhash.xxh32(locationBytes, seed=baseHash).intdigest()

    # Hash of location using static seed 0x1B845238
    hashB = xxhash.xxh32(locationBytes, seed=0x1B845238).intdigest()
    return hashA, hashB
Example #6
    def inCache(self, key, val, var_type):
        '''
        Check if variable is in cache by hashing
        '''
        hsh = ''
        if var_type == np.ndarray:
            arr_bytes = bytes(val.data)
            hsh = xxhash.xxh32(arr_bytes).hexdigest() + str(val.shape)
        else:
            hsh = xxhash.xxh32(JSON.dumps(val).encode('utf8')).hexdigest()
        if key in self.cache and hsh == self.cache[key]:
            return True, hsh
        return False, hsh
Example #7
    def test_xxh32_overflow(self):
        a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=0)
        b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**32)
        self.assertEqual(a.seed, b.seed)
        self.assertEqual(a.intdigest(), b.intdigest())
        self.assertEqual(a.hexdigest(), b.hexdigest())
        self.assertEqual(a.digest(), b.digest())

        a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=1)
        b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**32 + 1)
        self.assertEqual(a.seed, b.seed)
        self.assertEqual(a.intdigest(), b.intdigest())
        self.assertEqual(a.hexdigest(), b.hexdigest())
        self.assertEqual(a.digest(), b.digest())

        a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**33 - 1)
        b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**34 - 1)
        self.assertEqual(a.seed, b.seed)
        self.assertEqual(a.intdigest(), b.intdigest())
        self.assertEqual(a.hexdigest(), b.hexdigest())
        self.assertEqual(a.digest(), b.digest())

        a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**65 - 1)
        b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**66 - 1)
        self.assertEqual(a.seed, b.seed)
        self.assertEqual(a.intdigest(), b.intdigest())
        self.assertEqual(a.hexdigest(), b.hexdigest())
        self.assertEqual(a.digest(), b.digest())
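The behavior under test is that seeds are reduced modulo 2**32, so any seed and its 32-bit remainder produce identical digests. A standalone check:

import xxhash

seed = 2**33 + 5
assert (xxhash.xxh32(b'data', seed=seed).intdigest()
        == xxhash.xxh32(b'data', seed=seed % 2**32).intdigest())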
Example #8
    def perturb(self, x, p):

        if self.bern_ps[p]:
            #x_hash= (xxhash.xxh32(self.hash_cache[x], seed=p).intdigest()) % self.g
            pert_val = (xxhash.xxh32(self.hash_cache[x],
                                     seed=p).intdigest()) % self.g
        else:
            pert_val = self.uni_dist[p]
        dom_index = 0
        while dom_index < self.sz:
            if pert_val == (xxhash.xxh32(self.hash_cache[dom_index],
                                         seed=p).intdigest() % self.g):
                self.estimate[dom_index] += 1.0
            dom_index += 1
Example #9
    def __init__(self, epsilon, d, k, g=2, use_olh=True, index_mapper=None, hash_matrix=None):
        """

        Args:
            epsilon: float - The privacy budget
            d: integer - Size of the data domain
            k: integer - The number of hash functions to use. Larger k results in a more accurate oracle at the expense of computation time.
            g: Optional float - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing)
            use_olh: Optional boolean - if set to true uses Optimised Local Hashing i.e g is set to round(e^epsilon + 1)
            index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain
            hash_matrix: Optional matrix - Allows the use of a pre-computed hash matrix that contains hashed domain elements
        """
        self.k = k
        super().__init__(epsilon, d, g, use_olh, index_mapper=index_mapper)
        self.hash_counts = np.zeros((self.k, self.g))

        # g = lambda i,j: xxhash.xxh32(str(int(j)), seed=int(i)).intdigest() % self.g

        if hash_matrix is None:
            matrix = np.empty((self.k, self.d))
            for i in range(0, self.k):
                for j in range(0, self.d):
                    matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % self.g

            # self.hash_matrix = np.fromfunction(g, (self.k, self.d))
            self.hash_matrix = matrix
        else:
            self.hash_matrix = hash_matrix
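To make the precomputation concrete, a minimal standalone sketch of the same hash matrix for a tiny domain (k=3 hash functions, d=5 items, g=2):

import numpy as np
import xxhash

k, d, g = 3, 5, 2
matrix = np.empty((k, d), dtype=int)
for i in range(k):
    for j in range(d):
        matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % g

# a table lookup now replaces a per-query hash call:
assert matrix[1][4] == xxhash.xxh32(str(4), seed=1).intdigest() % g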
Example #10
File: liblz4.py Project: etern/lz4
    def _compress_frame(self):
        '''
        frame contains all the blocks, plus frame header and checksum
        '''
        self.dst_file.write(self._frame_header())

        def read_src(buf):
            return self.src_file.readinto(buf)

        self.src_buffer = bytearray(b'\0') * BLOCK_SIZE
        self.dst_buffer = bytearray(
            b'\0') * worst_case_block_length(BLOCK_SIZE)

        xxh = xxhash.xxh32(seed=0)

        nbytes = read_src(self.src_buffer)
        while nbytes != 0:
            block_len = lz4_compress_block(
                self.dst_buffer, memoryview(self.src_buffer)[0:nbytes])
            self.dst_file.write(memoryview(self.dst_buffer)[0:block_len])
            # only pinned buffer, not appropriate here
            xxh.update(bytes(self.src_buffer[0:nbytes]))
            nbytes = read_src(self.src_buffer)

        self.dst_file.write((0).to_bytes(4, 'little'))  # EndMark
        self.dst_file.write(xxh.intdigest().to_bytes(4, 'little'))  # CheckSum
Example #11
def encode_lz4_old_kafka(buff):
    """Encode buff for 0.8/0.9 brokers -- requires an incorrect header checksum.

    Reference impl: https://github.com/dpkp/kafka-python/blob/a00f9ead161e8b05ac953b460950e42fa0e0b7d6/kafka/codec.py#L227
    """
    assert xxhash is not None
    data = encode_lz4(buff)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        buff = data[header_size + 8:]
    else:
        buff = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size -
                           1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([data[0:header_size - 1], hc, buff])
Example #12
def hash_netloc(netloc):

    m = xxhash.xxh32()
    m.update(netloc.encode("utf-8"))
    nlhash = m.intdigest()

    return nlhash
Example #13
def get_hash(v, m, k):
    # using v, get k hashes in range [0,m]
    val = np.sum(v)
    this_hash = (
        np.array([xxhash.xxh32(str(val + i)).intdigest()
                  for i in range(k)]) - mn_hash) / (mx_hash - mn_hash)
    return this_hash * m
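mn_hash and mx_hash are module globals not shown in this snippet; given how get_hash rescales digests into [0, 1], plausible definitions would be the bounds of an unsigned 32-bit hash (an assumption, not the project's actual values):

mn_hash = 0            # assumed minimum of xxh32
mx_hash = 2**32 - 1    # assumed maximum of xxh32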
Example #14
    def __init__(self, name, passk):
        self.name = name
        self.__serc_id = passk
        self.__key = xxhash.xxh32(str(self.name),
                                  seed=self.__serc_id).hexdigest()

        self.__dict = {}
Example #15
def get_modded_msyts(msg_sarc: sarc.SARC, lang: str = 'USen',
                     tmp_dir: Path = util.get_work_dir() / 'tmp_text') -> (list, dict):
    """
    Gets a list of modified game text files in a given message SARC

    :param msg_sarc: The message SARC to scan for changes.
    :type msg_sarc: class:`sarc.SARC`
    :param lang: The game language to use, defaults to USen.
    :type lang: str, optional
    :param tmp_dir: The temp directory to use, defaults to "tmp_text" in BCML's working directory.
    :type tmp_dir: class:`pathlib.Path`, optional
    :returns: A tuple containing a list of modded text files and a dict of new text
    files with their contents.
    :rtype: (list of str, dict of str: bytes)
    """
    hashes = get_msbt_hashes(lang)
    modded_msyts = []
    added_msbts = {}
    write_msbts = []
    for msbt in msg_sarc.list_files():
        if any(exclusion in msbt for exclusion in EXCLUDE_TEXTS):
            continue
        m_data = msg_sarc.get_file_data(msbt)
        m_hash = xxhash.xxh32(m_data).hexdigest()
        if msbt not in hashes:
            added_msbts[msbt] = m_data
        elif m_hash != hashes[msbt]:
            write_msbts.append((tmp_dir / msbt, m_data.tobytes()))
            modded_msyts.append(msbt.replace('.msbt', '.msyt'))
    if write_msbts:
        pool = multiprocessing.Pool()
        pool.map(write_msbt, write_msbts)
        pool.close()
        pool.join()
    return modded_msyts, added_msbts
Example #16
def pickValAndProp(new_pos, sudoku, r, ste_Hsh):
    try:
        val = 0
        tried = set()
        x = xxhash.xxh32()
        noOfStates = len(ste_Hsh)
        i = 0
        if (new_pos == [-1, -1]):
            return (-1, -1)
        common = np.intersect1d(r[0][locateSquareOfPos([new_pos[0],new_pos[1]],sudoku)]\
                    ,np.intersect1d(r[2][new_pos[0]],r[1][new_pos[1]]))

        for val in common:
            sudoku[new_pos[0]][new_pos[1]] = val
            x.update(sudoku)
            ste_Hsh.add(x.digest())
            x.reset()
            if (noOfStates < len(ste_Hsh)):
                break
        if (noOfStates == len(ste_Hsh)):
            return (-1, -1)

        #remove from square
        r[0][locateSquareOfPos(new_pos, sudoku)].remove(val)
        #remove from col
        r[1][new_pos[1]].remove(val)
        #remove from row
        r[2][new_pos[0]].remove(val)
        #print("no of states",noOfStates)
        return (sudoku, r)
    except Exception:
        return (-1, -1)
Example #17
    def _load_definition(self, uri=None, text=None, allow_imports=True):
        if not uri and not text:
            raise ValueError("A schema uri or text must be defined")
        elif uri and text:
            raise ValueError(
                "You cannot specify multiple sources. Choose one: uri or text."
            )

        if uri:
            if uri.startswith("http"):
                text = requests.get(uri).content
            else:
                uri = uri[len("file://"):] if uri.startswith(
                    "file://") else uri
                with open(uri) as schema_file:
                    text = schema_file.read()

        definition = yaml.safe_load(text)
        if allow_imports:
            self._add_imports(definition)

        metadata = definition.setdefault("__metadata__", {})
        if not metadata.get("schema_version", None):
            metadata["schema_version"] = xxhash.xxh32(text).hexdigest()

        return definition
Example #18
def listener(sok, monitor, rate_queue, timeout_params, progress,
             last_packet_num):
    timeout_interval = 0.5
    while (len(monitor)):
        progress = 100 - ((len(monitor) - 1) * 100 / last_packet_num + 1) + 1
        print("progress = {:.2f}%".format(progress))
        try:
            sok.settimeout(timeout_interval)
            message, clientAddress = sok.recvfrom(2048)
            # sequence_num = int.from_bytes(message[0:4], byteorder='big')
            first_byte = message[0]
            # discard packet if message type is 0(handshake)
            if first_byte >> 7 & 1 == 0:
                continue
            # if hash is not correct, discard ack
            if message[4:8] != xxhash.xxh32(message[0:4]).digest():
                continue
            else:
                sequence_num = (first_byte & int('7f', 16)).to_bytes(
                    1, byteorder='big') + message[1:4]
                sequence_num = int.from_bytes(sequence_num[0:4],
                                              byteorder='big')
            if sequence_num in monitor:

                # finding RTT
                sampleRTT = current_time() - monitor[sequence_num]
                if timeout_params.get('estRTT', -1) == -1:
                    timeout_params['devRTT'] = 0
                    timeout_params['estRTT'] = sampleRTT
                else:
                    timeout_params['estRTT'] = 0.125 * \
                        (sampleRTT) + 0.875*timeout_params['estRTT']
                    timeout_params['devRTT'] = 0.75 * \
                        timeout_params['devRTT'] + \
                        abs(timeout_params['estRTT']-sampleRTT)
                    timeout_params['interval'] = timeout_params['estRTT'] + \
                        4*timeout_params['devRTT']

                monitor.pop(sequence_num, -1)

                # rate_queue.pop(sequence_num, -1)
            # if rate_queue.get(sequence_num, -1) != -1:
            #     print("got ack for", sequence_num, "qlen", len(rate_queue),
            #           "RTT", current_time() - rate_queue[sequence_num])
            # else:
            # print("got ack for", sequence_num)
        except socket.timeout:
            print('will try after {} seconds'.format(timeout_interval))
            timeout_interval *= 1.5
        except ConnectionRefusedError:
            print('It seems the server is not online, sleeping for {} seconds'.
                  format(timeout_interval))
            time.sleep(timeout_interval)
            timeout_interval *= 1.5
        finally:
            if timeout_interval > 10:
                print('no response for 10 seconds, exiting')
                monitor['disconnected'] = True
                exit()
    sok.close()
Example #19
def show(pkg, filename):
    if os.path.exists(pkg) and os.path.isfile(pkg):
        fp = open(pkg, "rb")
        fp.seek(3)
        count = readUInt32(fp)
        hashOff = readUInt32(fp)
        blockOff = readUInt32(fp)
        hashs = []
        blocks = []
        fp.seek(hashOff)
        for i in range(count):
            hashs.append(readUInt32(fp))
        fp.seek(blockOff)
        for i in range(count):
            off = readUInt32(fp)
            length = readUInt32(fp)
            blocks.append((off, length))
        print("blocks:")
        print(blocks)
        hashValue = xxhash.xxh32(filename, seed=0).intdigest()
        blk = blocks[hashs.index(hashValue)]
        fp.seek(blk[0])
        detail = fp.read(blk[1])
        print("file detail:%s" % detail.encode("hex"))
        flag = struct.unpack("=B", detail[0:1])[0]
        size = struct.unpack("=I", detail[1:5])[0]
        content = struct.unpack("={}s".format(blk[1] - 5), detail[5:])[0]
        decode = zlib.decompress(content)
        print(len(decode))
        return decode
Example #20
    def XQqwlHlXKK(self, e, i):
        r = []
        for o in range(16):
            r.append(92 ^ e[o])
        n = xxhash.xxh32(b'', seed=0)
        s = xxhash.xxh32(b'', seed=0)
        n.update(bytes(r))
        for o in range(16):
            r[o] ^= 106
        s.update(bytes(r))
        s.update(i)
        a = s.hexdigest()  # is b8a7c677?
        n.update(bytes(self.pmAWhahfKx(a)))
        c = n.hexdigest()  # is 3f97d2f6?
        d = self.pmAWhahfKx(c)
        return bytes(d)
Example #21
def replace_values_metadata(df0):
    df = df0.copy()
    if "source_sex" in df.columns:
        df["source_sex"] = df["source_sex"].replace(
            ["Woman", "Female", "FEmale"], "F")
        df["source_sex"] = df["source_sex"].replace(["Male"], "M")
        df["source_sex"] = df["source_sex"].replace(
            ["Unknown", "unknwon", "U"], "?")
    if "adm2" in df.columns:
        df['adm2'] = df['adm2'].str.title()
        df["adm2"] = df["adm2"].replace(["Unknown Source", "Unknown"], "")
        #df["adm2"] = df["adm2"].replace({"Greater_London":"Greater London"}) # "Hertfordshire" != "Herefordshire"
        df["adm2"] = df["adm2"].replace(
            ["Greater_London"],
            "Greater London")  # "Hertfordshire" != "Herefordshire"
        df['adm2'] = df['adm2'].map(lambda x: x
                                    if str(x) == "Norfolk" else "code" + str(
                                        xxhash.xxh32(str(x)).hexdigest()[:3])
                                    )  ## no ADM2 leaves the servers
        #df['adm2'].fillna(df.country, inplace=True)
    if "is_icu_patient" in df.columns:
        df["is_icu_patient"] = df["is_icu_patient"].str.replace("Unknown", "?")

    #df["uk_lineage"] = df["uk_lineage"].replace(np.nan, "x", regex=True)
    return df
Example #22
def get_msbt_hashes(lang: str = 'USen') -> {}:
    """
    Gets the MSBT hash table for the given language, or US English by default

    :param lang: The game language to use, defaults to USen.
    :type lang: str, optional
    :returns: A dictionary of MSBT files and their vanilla hashes.
    :rtype: dict of str: str
    """
    if not hasattr(get_msbt_hashes, 'texthashes'):
        get_msbt_hashes.texthashes = {}
    if lang not in get_msbt_hashes.texthashes:
        hash_table = util.get_exec_dir() / 'data' / 'msyt' / \
            f'Msg_{lang}_hashes.csv'
        if hash_table.exists():
            get_msbt_hashes.texthashes[lang] = {}
            with hash_table.open('r') as h_file:
                csv_loop = csv.reader(h_file)
                for row in csv_loop:
                    get_msbt_hashes.texthashes[lang][row[0]] = row[1]
        elif util.get_game_file(f'Pack/Bootup_{lang}.pack').exists():
            get_msbt_hashes.texthashes[lang] = {}
            with util.get_game_file(f'Pack/Bootup_{lang}.pack').open(
                    'rb') as b_file:
                bootup_pack = sarc.read_file_and_make_sarc(b_file)
            msg_bytes = util.decompress(
                bootup_pack.get_file_data(
                    f'Message/Msg_{lang}.product.ssarc').tobytes())
            msg_pack = sarc.SARC(msg_bytes)
            for msbt in msg_pack.list_files():
                get_msbt_hashes.texthashes[lang][msbt] = xxhash.xxh32(
                    msg_pack.get_file_data(msbt)).hexdigest()
    return get_msbt_hashes.texthashes[lang]
Example #23
def encode_lz4_old_kafka(buff):
    """Encode buff for 0.8/0.9 brokers -- requires an incorrect header checksum.

    Reference impl: https://github.com/dpkp/kafka-python/blob/a00f9ead161e8b05ac953b460950e42fa0e0b7d6/kafka/codec.py#L227
    """
    assert xxhash is not None
    data = encode_lz4(buff)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        buff = data[header_size+8:]
    else:
        buff = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        buff
    ])
Example #24
def lz4_encode_old_kafka(payload):
    """Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
    assert xxhash is not None
    data = lz4_encode(payload)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        payload = data[header_size+8:]
    else:
        payload = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        payload
    ])
Example #25
def threaded_compare_texts(msyt: Path,
                           tmp_dir: Path,
                           lang: str = 'USen') -> (str, dict):
    """Diffs texts in an MYST in a way suitable for multiprocessing"""
    rel_path = str(msyt.relative_to(tmp_dir)).replace('\\', '/')
    if lang in ['USen', 'EUen']:
        lang = 'XXen'
    try:
        with (tmp_dir / rel_path).open('r', encoding='utf-8') as mod_file:
            contents = mod_file.read()
            xhash = xxhash.xxh32(contents.encode('utf8')).hexdigest()
            if xhash == get_msyt_hashes()[lang][rel_path]:
                return rel_path, None
            import yaml.reader
            try:
                mod_text = yaml.safe_load(contents)
            except yaml.reader.ReaderError:
                err = ValueError(
                    f'A character in {rel_path} could not be read')
                err.error_text = (
                    f'A character in {rel_path} could not be read. This probably means that the MSBT '
                    'file is damaged or corrupt. You may need to report this to the mod\'s creator.'
                )
                raise err
    except FileNotFoundError:
        return rel_path, None
    text_edits = {'entries': {}}
    hashes = get_entry_hashes()[lang]
    for entry, text in mod_text['entries'].items():
        if entry not in hashes or hashes[entry] != get_entry_hash(
                text['contents']):
            text_edits['entries'][entry] = copy.deepcopy(text)
    return rel_path, text_edits
Example #26
def minhash_faster_but_less_random(string_set):
    hashers = [xxhash.xxh32(w.encode('utf8')) for w in string_set]
    hashes = np.asarray([h.intdigest() for h in hashers])
    while True:
        hashes *= 2654435761
        hashes %= 2 ** 32
        yield np.min(hashes)
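Since the function is a generator, a caller draws however many signatures it needs; a usage sketch (the call site is assumed):

import itertools

gen = minhash_faster_but_less_random({'apple', 'banana', 'cherry'})
signature = list(itertools.islice(gen, 64))  # first 64 minhash values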
Example #27
def organize_dir(initial_path, db):
    # list of the content of directory
    listdir = [initial_path + "/" + fd for fd in os.listdir(initial_path)]
    # recurse into all subfolders
    print("+ Recursing into subfolders...")
    for i in [d for d in listdir if os.path.isdir(d)]:
        organize_dir(i, db)
    #get media
    print("+ Get media")
    media = [{'path': f, 'date': get_date(f), 'size': os.stat(f).st_size} for f in listdir if
             os.path.isfile(f) and '.' in f and
             f.split('.')[-1].lower() in IMAGE_EXTENSION]
    #hash and insert in db
    print("+ Insert into db")
    import xxhash
    for m in media:
        print(m['path'])
        #calculate hash
        with open(m['path'], 'rb') as afile:
            digest = str(xxhash.xxh32(afile.read()).hexdigest())
        if db.execute("select * from image where hash = ?", [digest]).fetchall():
            print("Already in db")
            continue  # skip files whose hash is already recorded
        db.execute("insert into image (path, year, month, day, hash, size) values (?,?,?,?,?,?) ;",
                   [m['path'], m['date'].year, m['date'].month, m['date'].day, digest, m['size']])
        print(str(m["date"]) + "  " + digest)
    db.commit()
Example #28
    def __init__(self, i, p, channels, display):
        seed = getrandbits(32)
        self.fn = xxhash.xxh32(seed=seed)
        #self.total = 0    #i think this line can be removed
        self.i = i
        self.threshold = math.pow(2, i)
        #create vectors of a,b,c values for each channel
        #sum{j*x_j}
        self.a = np.zeros(channels, dtype=int)
        #sum{x_j}
        self.b = np.zeros(channels, dtype=int)
        #sum{x_j*r^j mod p}
        self.c = np.zeros(channels, dtype=int)
        self.p = p
        #TODO: make sure this randomness works safely, and doesn't, say, give
        #the same output each time you make a new RIS object
        self.r = randint(1, p - 1)
        #keep track of whether channels have been checked. If not, sampling will fail
        self.queryable = [False for j in range(channels)]
        #additionally keep track of linear combos of channels you may have checked
        #and whether they were queryable.  If not, sampling will fail
        #this could get too big if the user is able to run check_linear_combo
        #for an arbitrary number of linear combinations of channels. That
        #might need to be fixed.
        self.linear_queryable = {}
        self.display = display
Example #29
    def extract_attribute(self, base_object: BDBasicBlock) -> Optional[Dict]:
        # Check if value already exists
        BasicBlockCallees_value = base_object.get_attribute_value(
            'BasicBlockCallees')

        if BasicBlockCallees_value:
            pass
        else:
            names_hash = xxhash.xxh32()
            bb_start = base_object.underlying_obj.start
            bb_end = base_object.underlying_obj.end

            for call_site in base_object.underlying_obj.function.call_sites:
                if call_site.address in range(bb_start, bb_end):
                    bv: BinaryView = base_object.underlying_obj.view
                    for callee in bv.get_callees(call_site.address):
                        callee_name: str = bv.get_function_at(callee).name
                        if callee_name and not callee_name.startswith('sub_'):
                            names_hash.update(callee_name)

            BasicBlockCallees_value = {
                'callee_names_hash': names_hash.intdigest()
            }
            base_object.add_attribute_value('BasicBlockCallees',
                                            BasicBlockCallees_value)

            if names_hash.intdigest() == 0:
                log.log_debug(
                    f'BasicBlockCallees: No names to extract, names_hash is 0')

        return BasicBlockCallees_value if BasicBlockCallees_value else None
Example #30
    def _perturb(self, data, seed):
        """
        Used internally to perturb data using local hashing.

        Will hash the user's data item and then perturb it with probabilities that
        satisfy epsilon local differential privacy. Local hashing is explained
        in more detail here: https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf

        Args:
            data: User's data to be privatised
            seed: The seed for the user's hash function

        Returns: perturbed data

        """
        index = self.index_mapper(data)

        # Taken directly from Wang (https://github.com/vvv214/LDP_Protocols/blob/master/olh.py#L55-L65)
        x = (xxhash.xxh32(str(index), seed=seed).intdigest() % self.g)
        y = x

        p_sample = np.random.random_sample()
        # the following two are equivalent
        # if p_sample > p:
        #     while not y == x:
        #         y = np.random.randint(0, g)
        if p_sample > self.p - self.q:
            # perturb
            y = np.random.randint(0, self.g)

        return y
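The self.p and self.q used above are set outside this snippet; for standard Optimised Local Hashing (per the linked Wang et al. paper) they follow from epsilon and g, roughly as sketched here:

import numpy as np

def olh_probabilities(epsilon, g):
    # p: probability of reporting the true hashed value
    # q: probability of reporting any particular other value
    e = np.exp(epsilon)
    p = e / (e + g - 1)
    q = 1.0 / (e + g - 1)
    return p, q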
Example #31
def content_id_text(text, partial=False):

    # 1. Normalize (drop whitespace)
    text = text_normalize(text, keep_ws=False)

    # 2. Create 13 character n-grams
    ngrams = ("\u0020".join(l) for l in sliding_window(text, WINDOW_SIZE_CID_T))

    # 3. Create 32-bit features with xxHash32
    features = (xxhash.xxh32(s.encode("utf-8")).intdigest() for s in ngrams)

    # 4. Apply minimum_hash
    minhash = minimum_hash(features, n=64)

    # 5. Collect least significant bits of first 64 minhash signatures
    lsb = "".join([str(x & 1) for x in minhash])

    # 6. Create 64-bit digests
    digest = int(lsb, 2).to_bytes(8, "big", signed=False)

    # 7. Prepend component header
    if partial:
        content_id_text_digest = HEAD_CID_T_PCF + digest
    else:
        content_id_text_digest = HEAD_CID_T + digest

    # 8. Encode and return
    return encode(content_id_text_digest)
Example #32
def lz4_encode_old_kafka(payload):
    """Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
    assert xxhash is not None
    data = lz4_encode(payload)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        payload = data[header_size+8:]
    else:
        payload = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        payload
    ])
Example #33
    def uid_shard(self, uid):
        try:
            uid_hash = xxhash.xxh32(str(uid), seed=101).intdigest()
        except Exception:
            return -1
        shard_idx = uid_hash % len(self.stub_list)
        return shard_idx
Example #34
def find_or_load_beats(filename: str, loader: bm.beats.loader.BeatLoader) -> bm.Beats:
    h = xxhash.xxh32()
    with open(filename, "rb") as file:
        block = file.read(512)
        while block:
            h.update(block)
            block = file.read(512)
    d = h.digest()

    if d in song_cache:
        logger.info(f"Cache: hit {d}")
        try:
            return pickle.load(open(song_cache[d], "rb"))
        except Exception as e:
            logger.exception(f"Cache: failed to load {d}, falling through to miss")

    logger.info(f"Cache: miss {d}, creating...")
    beats = bm.Beats.from_song(filename, beat_loader=loader)
    beat_filename = f"{filename}_beats.pkl"

    with open(beat_filename, "wb") as fp:
        logger.info(f"Cache: creating entry {d} at {beat_filename}")
        pickle.dump(beats, fp)

    song_cache[d] = beat_filename

    return beats
Example #35
    def extract_attribute(self, base_object: BDFunction) -> Optional[Dict]:
        # Check if value already exists
        FunctionStringReferences_value = base_object.get_attribute_value(
            'FunctionStringReferences')

        if FunctionStringReferences_value:
            pass

        else:
            strings_hash = xxhash.xxh32()
            current_function: Function = base_object.underlying_obj

            for addr in range(current_function.lowest_address,
                              current_function.highest_address):
                const_refs = current_function.get_constants_referenced_by(addr)
                for ref in const_refs:
                    string = current_function.view.get_string_at(ref.value)
                    if string:
                        strings_hash.update(string.value.encode('utf8'))

            FunctionStringReferences_value = {
                'strings_hash': strings_hash.intdigest()
            }

            base_object.add_attribute_value('FunctionStringReferences',
                                            FunctionStringReferences_value)

        return FunctionStringReferences_value if FunctionStringReferences_value else None
Example #36
def generate_hashes(frequency_idx, time_idx):
    """ Compute hashes from the peak indices """
    # Pair each peak with the peaks that follow it and hash each pair,
    # subject to a maximum time-delta constraint

    time_frequency = []
    for f_idx, t_idx in zip(frequency_idx, time_idx):
        time_frequency.append([t_idx, f_idx])
    # sort by time, then by frequency
    time_frequency.sort()

    peak_len = len(time_frequency)
    for i in range(peak_len):
        for j in range(1, DETECT_PARAMETER.FAN_VALUE):
            if (i + j) < peak_len:
                freq1 = time_frequency[i][Frequency_idx]
                freq2 = time_frequency[i + j][Frequency_idx]
                t1 = time_frequency[i][Time_idx]
                t2 = time_frequency[i + j][Time_idx]
                time_delta = t2 - t1

                if time_delta <= DETECT_PARAMETER.MAX_HASH_TIME_DELTA:
                    h = xxhash.xxh32("%s|%s|%s" %
                                     (str(freq1), str(freq2), str(time_delta)))
                    yield (h.hexdigest(), t1)
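Time_idx and Frequency_idx are module constants not shown in this snippet; since each peak is appended as [t_idx, f_idx], they must be:

Time_idx = 0        # position of the time index in each [t_idx, f_idx] pair
Frequency_idx = 1   # position of the frequency index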
Example #37
def hash_function(shingle, function_id):
    try:
        return xxhash.xxh32(shingle.encode("utf8") * function_id).intdigest()
    except Exception as e:
        print(e)
        print(shingle)
        sys.exit(-1)
Example #38
    def _hash(self, item):
        # get Python hash ID of object
        # technique used by Rafa Carrascosa
        # https://github.com/rafacarrascosa/countminsketch
        h = xxhash.xxh32(str(hash(item)))
        for i in range(self.num_rows):
            h.update(str(i))
            yield h.intdigest() % self.num_columns
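A minimal sketch of how such a row-hash generator plugs into a count-min sketch (num_rows and num_columns are taken from the snippet; the surrounding class is assumed, not the original project's code):

import xxhash

class CountMinSketch:
    def __init__(self, num_rows=4, num_columns=1024):
        self.num_rows = num_rows
        self.num_columns = num_columns
        self.table = [[0] * num_columns for _ in range(num_rows)]

    def _hash(self, item):
        # same rolling-update trick as above: one hasher yields one column per row
        h = xxhash.xxh32(str(hash(item)))
        for i in range(self.num_rows):
            h.update(str(i))
            yield h.intdigest() % self.num_columns

    def add(self, item):
        for row, col in enumerate(self._hash(item)):
            self.table[row][col] += 1

    def count(self, item):
        return min(self.table[row][col] for row, col in enumerate(self._hash(item)))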
Example #39
    def test_XXH32_reset(self):
        x = xxhash.xxh32()
        h = x.intdigest()

        for i in range(10, 50):
            x.update(os.urandom(i))

        x.reset()

        self.assertEqual(h, x.intdigest())
Example #40
def _hash_with_seed(funcname, seed):
    seed = xxhash.xxh32(seed).intdigest()

    xxh32 = xxhash.xxh32
    spooky32 = spooky.hash32

    if funcname == 'xxhash32':
        return lambda x: xxh32(x, seed=seed).intdigest()
    elif funcname == 'spooky32':
        return lambda x: spooky32(x, seed=seed)
    else:
        raise ValueError('Unknown function name: %s' % funcname)
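A usage sketch (note the function references spooky.hash32 unconditionally, so both xxhash and spooky must be importable even for the xxhash branch):

h = _hash_with_seed('xxhash32', b'my-seed')
print(h(b'hello'))  # seeded 32-bit digest as an int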
Example #41
File: liblz4.py Project: etern/lz4
    def _parse_header(self):
        # IMPORTANT: for simplicity, lz4 configuration is not fully supported
        buf = self.src_file.read(7)
        if len(buf) != 7 or int.from_bytes(buf[0:4], 'little') != MAGIC_NUMBER:
            raise BadFileError

        if buf[4] != int('01100100', 2):  # FLG
            raise BadFileError

        if buf[5] != int('01110000', 2):  # BD
            raise BadFileError

        checksum = xxhash.xxh32(buf[4:6], seed=0).digest()[2]
        if checksum != buf[6]:
            raise BadFileError
Example #42
    def test_XXH32(self):
        x = xxhash.xxh32()
        x.update('a')
        self.assertEqual(xxhash.xxh32('a').digest(), x.digest())
        x.update('b')
        self.assertEqual(xxhash.xxh32('ab').digest(), x.digest())
        x.update('c')
        self.assertEqual(xxhash.xxh32('abc').digest(), x.digest())

        seed = random.randint(0, 2**32)
        x = xxhash.xxh32(seed=seed)
        x.update('a')
        self.assertEqual(xxhash.xxh32('a', seed).digest(), x.digest())
        x.update('b')
        self.assertEqual(xxhash.xxh32('ab', seed).digest(), x.digest())
        x.update('c')
        self.assertEqual(xxhash.xxh32('abc', seed).digest(), x.digest())
Example #43
File: liblz4.py Project: etern/lz4
    def _frame_header(self):
        header = bytearray()
        header += MAGIC_NUMBER.to_bytes(4, 'little')
        # default frame descriptor FLG, Version Number 01
        # Block Independence 1, Block Checksum 0
        # Content Size 0, Content Checksum 1
        FD_FLG = int('01100100', 2)
        # frame descriptor BD
        # Block Max Size 7 -> 4M
        FD_BD = int('01110000', 2)
        # frame descriptor header checksum
        checksum = xxhash.xxh32(bytes([FD_FLG, FD_BD]), seed=0).digest()
        FD_HC = checksum[2]
        header.append(FD_FLG)
        header.append(FD_BD)
        header.append(FD_HC)
        return header
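The HC byte produced here is exactly what _parse_header in Example #41 verifies. A standalone check of that relationship (0x184D2204 is the standard LZ4 frame magic number):

import xxhash

MAGIC_NUMBER = 0x184D2204
FD_FLG = int('01100100', 2)
FD_BD = int('01110000', 2)
header = MAGIC_NUMBER.to_bytes(4, 'little') + bytes([FD_FLG, FD_BD])
hc = xxhash.xxh32(header[4:6], seed=0).digest()[2]  # the byte _parse_header compares against buf[6]
frame_header = header + bytes([hc])
assert len(frame_header) == 7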
Example #44
def thread_affinity(url, total_worker_count):
	'''
	Ensure only one client ever works on each netloc.
	This maintains better consistency of user-agents
	'''

	# Only limit netlocs if we actually need to.
	if not getModuleForUrl(url).single_thread_fetch(url):
		return True

	netloc = urllib.parse.urlsplit(url).netloc

	m = xxhash.xxh32()
	m.update(netloc.encode("utf-8"))

	nlhash = m.intdigest()
	thread_aff = nlhash % total_worker_count
	# print("Thread affinity:", self.total_worker_count, self.worker_num, thread_aff, self.worker_num == thread_aff)
	return thread_aff
Example #45
def lz4_encode(payload):
    data = lz4f.compressFrame(payload)  # pylint: disable-msg=no-member
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(data[4], int):
        flg = data[4]
    else:
        flg = ord(data[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        data[header_size:]
    ])
Example #46
def lz4_decode_old_kafka(payload):
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(payload[4], int):
        flg = payload[4]
    else:
        flg = ord(payload[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(payload[4:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_payload = b''.join([
        payload[0:header_size-1],
        hc,
        payload[header_size:]
    ])
    return lz4_decode(munged_payload)
Example #47
def lz4_encode_old_kafka(payload):
    """Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
    data = lz4_encode(payload)
    header_size = 7
    if isinstance(data[4], int):
        flg = data[4]
    else:
        flg = ord(data[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        data[header_size:]
    ])
Example #48
File: liblz4.py Project: etern/lz4
    def _extract_frame(self):
        self._parse_header()
        xxh = xxhash.xxh32(seed=0)

        while True:
            buf = self.src_file.read(4)
            block_len = int.from_bytes(buf, 'little')
            if block_len == 0:  # end mark
                break
            buf = self.src_file.read(block_len)
            if len(buf) != block_len:
                raise BadFileError
            restored_block = bytearray()
            lz4_decompress_sequences(buf, restored_block)
            self.dst_file.write(restored_block)
            # only pinned buffer, not appropriate here
            xxh.update(bytes(restored_block))

        buf = self.src_file.read(4)
        # xxh.digest will give a big endian result
        if int.from_bytes(buf, 'little') != xxh.intdigest():
            raise BadFileError
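The endianness note above, in standalone form: digest() returns big-endian bytes while the frame stores its checksum little-endian, which is why the comparison goes through intdigest():

import xxhash

h = xxhash.xxh32(b'abc')
assert h.digest() == h.intdigest().to_bytes(4, 'big')
# so a little-endian on-disk checksum is read with int.from_bytes(buf, 'little')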
Example #49
def lz4_decode(payload):
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(payload[4], int):
        flg = payload[4]
    else:
        flg = ord(payload[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(payload[4:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_payload = b''.join([
        payload[0:header_size-1],
        hc,
        payload[header_size:]
    ])

    cCtx = lz4f.createCompContext()  # pylint: disable-msg=no-member
    data = lz4f.decompressFrame(munged_payload, cCtx)  # pylint: disable-msg=no-member
    return data['decomp']
Example #50
    def hashdirectory(self, directory, map):
        hashfunc = xxhash.xxh32()
        for file in os.listdir(directory):
            if(os.path.isdir(os.path.join(directory,file))):
                #print os.path.join(directory,file)
                key = self.hashdirectory(os.path.join(directory,file),map)
                if key in map:
                    map[key] = map[key] + "?"+os.path.join(directory,file)
                else:
                    map[key] = os.path.join(directory,file)
                hashfunc.update(key)
            if(os.path.isfile(os.path.join(directory,file))):
                hf = xxhash.xxh64()
                f = open(os.path.join(directory,file),'rb').read()
                byts = bytes(f)
                #mem = memoryview(byts)
                buffersize = 1048576
                bytesize = sys.getsizeof(byts)
                self.ldb.pgb.step(bytesize/1024)
                if bytesize-buffersize>0:
                    for i in range(0,bytesize-buffersize,buffersize):
                        if bytesize-i>buffersize:
                            hf.update(byts[i:(i+buffersize)])
                        else:
                            hf.update(byts[i:])
                else:
                    hf.update(byts[0:])

                key = hf.digest()
                if key in map:
                    map[key] = map[key] + "?"+os.path.join(directory,file)
                else:
                    map[key] = os.path.join(directory,file)
                hashfunc.update(key)
        key = hashfunc.digest()
        return key
Example #51
def key(length=10):  # NOTE: signature reconstructed; the original def line was truncated and the default length is assumed
    chars = string.ascii_letters + string.digits
    return ''.join([choice(chars) for i in range(length)])


def brokers_for_key(servers, key32bit):
    # (2**0) | 2**1 | 2**2 | 2**3 | 2**4 | 2**5 | 2**6 | 2**7 | 2**8 | 2**9 | 2**10
    k1 = key32bit & 2047 # lower 11 bits
    # (2**11) | 2**12 | 2**13 | 2**14 | 2**15 | 2**16 | 2**17 | 2**18 | 2**19 | 2**20 | 2**21
    k2 = key32bit & 4192256
    # (2**22) | 2**23 | 2**24 | 2**25 | 2**26 | 2**27 | 2**28 | 2**29 | 2**30 | 2**31 | 2**32
    k3 = key32bit & 8585740288
    # print "Keys: ", (k1,k2,k3)
    server_size = len(servers)
    return (servers[k1 % server_size],
            servers[k2 % server_size],
            servers[k3 % server_size])

def any_brokers_equals(s1, s2, s3):
    return s1 == s2 or s1 == s3 or s2 == s3

# start with 10 servers
cluster_servers = ips(10)
# print("servers: ", cluster_servers)
for i in range(0, 1000):
    producer_key = key()
    rpc_chain_path = xxhash.xxh32(producer_key).intdigest()
    server_selection = brokers_for_key(cluster_servers, rpc_chain_path)
    (s1,s2,s3) = server_selection
    if any_brokers_equals(s1,s2,s3):
        print "Found a collision: ", (s1, s2, s3), ",  rpc_chain: ", rpc_chain_path, ", producer key: ", producer_key
Example #52
    def test_xxh32(self):
        self.assertEqual(xxhash.xxh32('a').intdigest(), 1426945110)
        self.assertEqual(xxhash.xxh32('a', 0).intdigest(), 1426945110)
        self.assertEqual(xxhash.xxh32('a', 1).intdigest(), 4111757423)
Example #53
def generate_location_hash_by_seed(authticket, lat, lng, acc=5):
    first_hash = xxhash.xxh32(authticket, seed=HASH_SEED).intdigest()
    location_bytes = d2h(lat) + d2h(lng) + d2h(acc)
    loc_hash = xxhash.xxh32(location_bytes, seed=first_hash).intdigest()
    return ctypes.c_int32(loc_hash).value
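The final ctypes.c_int32(...).value reinterprets the unsigned 32-bit digest as a signed value, presumably because the consumer expects a signed integer:

import ctypes

assert ctypes.c_int32(0xFFFFFFFF).value == -1
assert ctypes.c_int32(5).value == 5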
Example #54
def generateLocation2(lat, lng, alt):
    # fall back to eight zero bytes when no altitude is available (the original
    # applied this check only after locationBytes was already built)
    alt_bytes = d2h(alt) if alt else "\x00\x00\x00\x00\x00\x00\x00\x00"
    locationBytes = d2h(lat) + d2h(lng) + alt_bytes
    return xxhash.xxh32(locationBytes, seed=0x1B845238).intdigest()  # hash of location using static seed 0x1B845238
Example #55
def generateLocation1(authticket, lat, lng, alt):
    firstHash = xxhash.xxh32(authticket, seed=0x1B845238).intdigest()
    # fall back to eight zero bytes when no altitude is available (the original
    # applied this check only after locationBytes was already built)
    alt_bytes = d2h(alt) if alt else "\x00\x00\x00\x00\x00\x00\x00\x00"
    locationBytes = d2h(lat) + d2h(lng) + alt_bytes
    return xxhash.xxh32(locationBytes, seed=firstHash).intdigest()
Example #56
def generate_location_hash(lat, lng, acc=5):
    location_bytes = d2h(lat) + d2h(lng) + d2h(acc)
    loc_hash = xxhash.xxh32(location_bytes, seed=HASH_SEED).intdigest()
    return ctypes.c_int32(loc_hash).value
Example #57
def minhash(string_set):
    hashers = [xxhash.xxh32(w.encode('utf8')) for w in string_set]
    while True:
        yield min(h.intdigest() for h in hashers)
        for h in hashers:
            h.update('.')
Example #58
def generateLocation2(lat, lng, alt):
    locationBytes = d2h(lat) + d2h(lng) + d2h(alt)
    return xxhash.xxh32(locationBytes, seed=static_seed).intdigest()
Example #59
def generateLocation1(authticket, lat, lng, alt):
    firstHash = xxhash.xxh32(authticket, seed=static_seed).intdigest()
    locationBytes = d2h(lat) + d2h(lng) + d2h(alt)
    return xxhash.xxh32(locationBytes, seed=firstHash).intdigest()