def hash_to_mem(file, print_progress=False):
    block_num = 0
    hashes = {}      # unique block hash -> first block index
    block_list = []  # hash of every block, in file order
    start_time = time.time()
    with open(file, "rb") as f:
        # Read first block
        block = f.read(block_size)
        while block != b"":
            # Compute hash of read block
            # hex_dig = hashlib.sha256(block).hexdigest()
            hex_dig = xxhash.xxh64_intdigest(block, 20181217)
            # Read next block
            block = f.read(block_size)
            # Store the block hash in the dict (if unique) and in the list of blocks (always)
            if hex_dig not in hashes:
                hashes[hex_dig] = block_num
            block_list.append(hex_dig)
            block_num += 1
            # Print progress if enabled
            if print_progress and block_num % 1000 == 0:
                print('.', end='', flush=True)
    if print_progress:
        print()
        print(" Hashing speed: %8.3f MB/s"
              % (block_size * block_num / 1024 / (time.time() - start_time + 0.1) / 1024))
        print(" Hashing took: %d:%02d" % divmod(time.time() - start_time, 60))
    return {'dict': hashes, 'list': block_list}
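A minimal usage sketch (hypothetical: the file names and `block_size` value are assumptions, and `time` and `xxhash` must already be imported for `hash_to_mem` itself). It compares the per-block hash lists of two images to count changed blocks.

block_size = 1024 * 1024  # 1 MiB blocks (an assumption; hash_to_mem reads this global)

a = hash_to_mem("disk_a.img", print_progress=True)
b = hash_to_mem("disk_b.img", print_progress=True)
changed = sum(1 for x, y in zip(a['list'], b['list']) if x != y)
print(f"{changed} of {len(a['list'])} blocks differ")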
def diff_msyt(msyt: Path, hashes: dict, mod_out: Path, ref_dir: Path):
    diff = {}
    filename = msyt.relative_to(mod_out).as_posix()
    if any(ex in filename for ex in EXCLUDE_TEXTS):
        msyt.unlink()
        return {}
    data = msyt.read_bytes()
    xxh = xxhash.xxh64_intdigest(data)
    if filename not in hashes or hashes[filename] != xxh:
        text = data.decode("utf8")
        if filename not in hashes:
            diff[filename] = json.loads(text)["entries"]
        else:
            ref_text = (ref_dir / filename).read_text("utf-8")
            if "".join(text.split()) != "".join(ref_text.split()):
                ref_contents = json.loads(ref_text)
                contents = json.loads(text)
                diff[filename] = {
                    entry: value
                    for entry, value in contents["entries"].items()
                    if (entry not in ref_contents["entries"]
                        or value != ref_contents["entries"][entry])
                }
            del ref_text
        del text
    msyt.unlink()
    del data
    return diff
def getnextblock(self) -> int:
    """Get a block of random bits from the random generator.

    Tip:
        Calling this method will advance the sequence exactly one time.
        The resulting index depends on whether or not the call occurs
        within a cascade.

    Returns:
        A block of :attr:`BLOCK_SIZE_BITS` random bits as an int
    """
    # Note that the current index is used as the hash seed, and
    # the hash input (generated by hashing the sequence's seed)
    # remains constant over the run of the sequence. This greatly
    # simplifies index generation across platforms, and, based on
    # testing, has no adverse effects on value distribution.
    result = xxhash.xxh64_intdigest(self._hash_input, self._index)
    if self._cascading:
        # Normally, the index is incremented after each block.
        # When cascading, generated blocks are fed forward to use
        # in subsequent blocks; hence, single indices cascade across
        # multiple calls to getnextblock().
        self._index = result
    else:
        self._index += 1
    return result
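For context, a minimal self-contained sketch of the sequence object this method appears to live on; the class name, constructor, and BLOCK_SIZE_BITS value are assumptions inferred from the docstring and comments, not the library's actual API.

import xxhash

class SequenceSketch:
    BLOCK_SIZE_BITS = 64  # xxh64 yields 64-bit blocks (an assumption)

    def __init__(self, seed: bytes, cascading: bool = False):
        # the hash input is derived once from the seed and stays constant
        self._hash_input = xxhash.xxh64_digest(seed)
        self._index = 0
        self._cascading = cascading

    def getnextblock(self) -> int:
        result = xxhash.xxh64_intdigest(self._hash_input, self._index)
        if self._cascading:
            self._index = result  # feed the block forward as the next seed
        else:
            self._index += 1
        return result

seq = SequenceSketch(b"example-seed")
print([seq.getnextblock() for _ in range(3)])  # deterministic across runs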
def hash(self) -> int:
    """
    Cached property containing the xxhash of the file

    :return: the 64-bit xxh64 digest of the file contents as an int
    """
    with open(self.path, "rb") as f:
        return xxhash.xxh64_intdigest(f.read())
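The docstring calls this a cached property, so a caching decorator was presumably applied in the original class. A minimal self-contained sketch of how that could look; the HashedFile class name is an assumption.

from functools import cached_property
from pathlib import Path

import xxhash

class HashedFile:
    def __init__(self, path: Path) -> None:
        self.path = path

    @cached_property
    def hash(self) -> int:
        # computed on first access, then cached on the instance
        with open(self.path, "rb") as f:
            return xxhash.xxh64_intdigest(f.read())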
def _pack_sarc(folder: Path, tmp_dir: Path, hashes: dict):
    packed = oead.SarcWriter(
        endian=oead.Endianness.Big
        if util.get_settings("wiiu")
        else oead.Endianness.Little
    )
    try:
        canon = util.get_canon_name(
            folder.relative_to(tmp_dir).as_posix(), allow_no_source=True
        )
        if canon not in hashes:
            raise FileNotFoundError("File not in game dump")
        stock_file = util.get_game_file(folder.relative_to(tmp_dir))
        try:
            old_sarc = oead.Sarc(util.unyaz_if_needed(stock_file.read_bytes()))
        except (RuntimeError, ValueError, oead.InvalidDataError):
            raise ValueError("Cannot open file from game dump")
        old_files = {f.name for f in old_sarc.get_files()}
    except (FileNotFoundError, ValueError):
        for file in {f for f in folder.rglob("**/*") if f.is_file()}:
            packed.files[file.relative_to(folder).as_posix()] = file.read_bytes()
    else:
        for file in {
            f
            for f in folder.rglob("**/*")
            if f.is_file() and f.suffix not in EXCLUDE_EXTS
        }:
            file_data = file.read_bytes()
            xhash = xxhash.xxh64_intdigest(util.unyaz_if_needed(file_data))
            file_name = file.relative_to(folder).as_posix()
            if file_name in old_files:
                old_hash = xxhash.xxh64_intdigest(
                    util.unyaz_if_needed(old_sarc.get_file(file_name).data)
                )
            if file_name not in old_files or (xhash != old_hash):
                packed.files[file_name] = file_data
    finally:
        shutil.rmtree(folder)
        if not packed.files:
            return  # pylint: disable=lost-exception
    sarc_bytes = packed.write()[1]
    folder.write_bytes(
        util.compress(sarc_bytes)
        if (folder.suffix.startswith(".s") and folder.suffix != ".sarc")
        else sarc_bytes
    )
def is_known(self, p):
    """Check a path, return True if it is known"""
    h = xxh64_intdigest(p)
    if h in self.unknown:
        self._add_known(h, p)
        return True
    return h in self.known
def get_classless_hash(self) -> int:
    # mask off the sign bit so the digest fits in a signed 64-bit integer
    return cast(
        int,
        xxhash.xxh64_intdigest(
            json.dumps(
                {
                    slot: getattr(self, slot)
                    for slot in self.__slots__
                    if slot != 'versionclass'
                },
                sort_keys=True,
            )
        ),
    ) & 0x7fffffffffffffff
def check_iter(self, paths):
    """Check paths from an iterable"""
    # failsafe for common dumb error
    if isinstance(paths, str):
        raise TypeError("expected iterable of strings, got a string")
    unknown = self.unknown
    for p in paths:
        h = xxh64_intdigest(p)
        if h in unknown:
            self._add_known(h, p)
def is_savedata_modded(savedata: oead.Sarc) -> bool:
    hashes = get_savedata_hashes()
    sv_files = sorted(savedata.get_files(), key=lambda file: file.name)
    fix_slash = "/" if not sv_files[0].name.startswith("/") else ""
    modded = False
    for svdata in sv_files[0:-2]:
        svdata_hash = xxhash.xxh64_intdigest(svdata.data)
        if not modded:
            modded = (
                fix_slash + svdata.name not in hashes
                or svdata_hash != hashes[fix_slash + svdata.name]
            )
    return modded
def is_file_modded(name: str, file: Union[bytes, Path], count_new: bool = True) -> bool:
    contents = (
        file
        if isinstance(file, bytes)
        else file.read_bytes()
        if isinstance(file, Path)
        else bytes(file)
    )
    if contents[0:4] == b"Yaz0":
        contents = decompress(contents)
    table = get_hash_table(get_settings("wiiu"))
    if name not in table:
        return count_new
    fhash = xxhash.xxh64_intdigest(contents)
    return fhash not in table[name]
def get_resource(self, key: str) -> Tuple[str, int]:
    """Return resource for key

    Args:
        key

    Returns:
        name of resource, bucket
    """
    k = xxh64_intdigest(key, self.seed)
    b = self.anchor.get_bucket(k)
    s = self.M[b]
    return s, b
def is_file_modded(name: str, file: Union[bytes, Path], count_new: bool = True) -> bool:
    table = get_hash_table(get_settings("wiiu"))
    if name not in table:
        return count_new
    contents = (
        file
        if isinstance(file, bytes)
        else file.read_bytes()
        if isinstance(file, Path)
        else bytes(file)
    )
    if contents[0:4] == b"Yaz0":
        try:
            contents = decompress(contents)
        except RuntimeError as err:
            raise ValueError(f"Invalid yaz0 file {name}") from err
    fhash = xxhash.xxh64_intdigest(contents)
    return fhash not in table[name]
def __init__(self, number: int, ruledata: dict[str, Any]) -> None:
    self.names = None
    self.namepat = None
    self.rulesets = None
    self.norulesets = None
    self.number = number

    self.pretty = str(ruledata)
    self.texthash = xxhash.xxh64_intdigest(self.pretty)

    self._matchers = []
    self._actions = []

    # handle substitution of final name in name matchers
    if 'name' in ruledata:
        self.names = yaml_as_list(ruledata['name'])
        if 'setname' in ruledata:
            self.names = [
                DOLLAR0.sub(ruledata['setname'], name) for name in self.names
            ]
        ruledata['name'] = self.names

    if 'namepat' in ruledata:
        self.namepat = ruledata['namepat'].replace('\n', '')
        if 'setname' in ruledata:
            self.namepat = DOLLAR0.sub(ruledata['setname'], self.namepat)
        ruledata['namepat'] = self.namepat

    if 'ruleset' in ruledata:
        self.rulesets = yaml_as_set(ruledata['ruleset'])

    if 'noruleset' in ruledata:
        self.norulesets = yaml_as_set(ruledata['noruleset'])

    # matchers
    for keyword, generate_matcher in get_matcher_generators():
        if keyword in ruledata:
            self._matchers.append(generate_matcher(ruledata))

    # actions
    for keyword, generate_action in get_action_generators():
        if keyword in ruledata:
            self._actions.append(generate_action(ruledata))
def __init__(self, key, n):
    self.key = key
    self.n = n
    # Figure out sizes needed for array
    for t in 'BHILQ':
        if array.array(t).itemsize * 8 > np.log(n) / np.log(2):
            typecode = t
            break
    y = Integer(1)
    table = [array.array(typecode) for _ in range(2**16)]
    for x in range(n):
        h = xxh64_intdigest(str(y)) % 2**16
        table[h].append(x)
        y = Integer(y) * Integer(4) % key.p
        if x % 1000000 == 0:
            print(x)
    self.table = table
def get_bucket(self, k: int) -> int:
    """Calculates bucket for key

    :param k: key, assumed to be uniform (already hashed)
    :return: assigned bucket
    """
    # uncomment next line if key not already hashed
    # k = xxh64_intdigest(bin(k), k)
    b = k % self.M
    while self.A[b] > 0:  # b is removed
        # next line is like random(seed=k, b)
        # could instead use: k = int(0xFFFFFFFFFFFFFFFF & (k * 2862933555777941757 + 1))
        k = xxh64_intdigest(bin(k) + bin(b), k)
        h = k % self.A[b]
        while self.A[h] >= self.A[b]:  # b removed prior to h
            h = self.K[h]
        b = h
    return b
def create_list(modpath: str):
    modpath = path.normpath(modpath)
    entries = {}
    for filepath in iglob(f"{modpath}/*"):
        if path.isfile(filepath):
            try:
                name = path.splitext(path.basename(filepath))[0]
                key = int(name, 16)
                entries[key] = ModEntry.create(filepath, key)
            except ValueError:
                pass
    for filepath in iglob(f"{modpath}/*/**/*", recursive=True):
        if path.isfile(filepath):
            relpath = path.relpath(filepath, modpath).lower().replace('\\', '/')
            key = xxh64_intdigest(relpath)
            entries[key] = ModEntry.create(filepath, key)
    return entries
def test_xxh64_overflow(self):
    s = 'I want an unsigned 64-bit seed!'
    # each pair of seeds is congruent modulo 2**64, so the digests must match
    for seed, overflowed in ((0, 2**64), (1, 2**64 + 1), (2**65 - 1, 2**66 - 1)):
        a = xxhash.xxh64(s, seed=seed)
        b = xxhash.xxh64(s, seed=overflowed)
        self.assertEqual(a.seed, b.seed)
        self.assertEqual(a.intdigest(), b.intdigest())
        self.assertEqual(a.hexdigest(), b.hexdigest())
        self.assertEqual(a.digest(), b.digest())
        self.assertEqual(a.intdigest(), xxhash.xxh64_intdigest(s, seed=seed))
        self.assertEqual(a.intdigest(), xxhash.xxh64_intdigest(s, seed=overflowed))
        self.assertEqual(a.digest(), xxhash.xxh64_digest(s, seed=seed))
        self.assertEqual(a.digest(), xxhash.xxh64_digest(s, seed=overflowed))
        self.assertEqual(a.hexdigest(), xxhash.xxh64_hexdigest(s, seed=seed))
        self.assertEqual(a.hexdigest(), xxhash.xxh64_hexdigest(s, seed=overflowed))
def data_iterator():
    # the line is gbk-encoded; decode to str before stripping and splitting
    fields = line.decode('gbk').strip('\r\n').split(' ')
    if len(fields) != 32:
        yield None
        return
    label_ctr = int(fields[0])
    slots = []
    feature_fields = fields[1:]
    for i in range(0, len(fea_sections)):
        slot = []
        describe = fea_sections[i]['fea_des']
        fea_type = fea_sections[i]['fea_type']
        size = int(fea_sections[i]['max_sz'])
        value_list = feature_fields[i].split(',')
        if fea_type in ['sparse']:
            for value in value_list:
                # why do hashing here? hashing should incorporate slotid
                slot.append(xxhash.xxh64_intdigest(value) % size)
        if len(slot) == 0:
            slot.append(0)
        slots.append(slot)
    slots.append([label_ctr])
    yield zip(self.slot_name, slots)
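The inline comment asks why the slot id is not folded into the hash. A hypothetical sketch of that idea (not part of the original pipeline): seeding xxh64 with the slot id so identical values in different slots land in different buckets.

import xxhash

def hash_feature(value: str, slot_id: int, size: int) -> int:
    # the slot id as seed separates collisions across slots
    return xxhash.xxh64_intdigest(value, seed=slot_id) % size

print(hash_feature("42", slot_id=0, size=1000))
print(hash_feature("42", slot_id=1, size=1000))  # almost always a different bucket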
def __init__(self, key, n):
    self.key = key
    self.n = n
    # Figure out sizes needed for array
    for t in 'BHILQ':
        if array.array(t).itemsize * 8 > np.log(n) / np.log(2):
            typecode = t
            break
    y = Integer(1)
    table = [array.array(typecode) for _ in range(2**16)]
    for x in range(n):
        h = xxh64_intdigest(str(y)) % 2**16
        table[h].append(x)
        y = Integer(y) * Integer(4) % key.p
        if x % 1000000 == 0:
            print(x)
    # Figure out equivalent numpy sizes
    s = array.array(typecode).itemsize * 8
    if s <= 8:
        nptype = np.uint8
    elif s <= 16:
        nptype = np.uint16
    elif s <= 32:
        nptype = np.uint32
    elif s <= 64:
        nptype = np.uint64
    else:
        raise TypeError("No numpy type large enough to hold array")
    maxtable_length = max([len(t) for t in table])
    table_of_nps = []
    for t in table:
        t_np = np.array(t, dtype=nptype)
        t_np.resize(maxtable_length)
        table_of_nps.append(t_np)
    self.table = np.asarray(table_of_nps)
def check(self, p):
    """Check a single path, print and add to known on match"""
    h = xxh64_intdigest(p)
    if h in self.unknown:
        self._add_known(h, p)
def compute_hash(cls, s):
    return xxh64_intdigest(s.lower())
def xxhash_u64_v1():
    # 100 loops, best of 100: 700 usec per loop
    for m1, m2 in prepare():
        i = xxhash.xxh64_intdigest(TEXT[m1.end():m2.start()])
def __hashing_shingles(self, shingles):
    return [xxhash.xxh64_intdigest(shingle) for shingle in shingles]
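A small usage sketch; the k_shingles helper is an assumption added for illustration, and only the hashing step comes from the method above. Character shingles hashed to 64-bit ints are a typical pre-step for MinHash.

import xxhash

def k_shingles(text: str, k: int = 3) -> list:
    # overlapping substrings of length k
    return [text[i:i + k] for i in range(len(text) - k + 1)]

hashed = [xxhash.xxh64_intdigest(s) for s in k_shingles("hello world")]
print(len(hashed), hex(min(hashed)))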
def dlog4(self, q):
    for x in self.table[xxh64_intdigest(str(q)) % 2**16]:
        if pow(Integer(4), int(x), self.key.p) == q:
            return x
def get_savedata_hashes() -> Dict[str, int]:
    savedata = get_stock_savedata()
    return {
        file.name: xxhash.xxh64_intdigest(file.data)
        for file in savedata.get_files()
    }
def get_gamedata_hashes() -> Dict[str, int]:
    gamedata = get_stock_gamedata()
    return {
        file.name: xxhash.xxh64_intdigest(file.data)
        for file in gamedata.get_files()
    }
def murmur(x):
    # despite the name, this wraps xxh64, not MurmurHash
    return np.uint64(xxhash.xxh64_intdigest(x))
def compute_hash_int64(value):
    # reinterpret the unsigned 64-bit digest as a signed two's-complement int
    n = xxh64_intdigest(value)
    return (n ^ 0x8000000000000000) - 0x8000000000000000
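A quick equivalence check (added for illustration, not from the original): the XOR/subtract trick matches reinterpreting the unsigned digest's bytes as a signed 64-bit integer via struct.

import struct

from xxhash import xxh64_intdigest

def to_int64(n: int) -> int:
    return (n ^ 0x8000000000000000) - 0x8000000000000000

n = xxh64_intdigest(b"example")
assert to_int64(n) == struct.unpack("<q", struct.pack("<Q", n))[0]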
def key_to_hash(key):
    if isinstance(key, str):
        # keep only the low 40 bits of the digest
        return xxh64_intdigest(key.lower()) & 0xffffffffff
    else:
        return key
def hash_output(dat_array):
    # hash the array's underlying buffer
    return xxhash.xxh64_intdigest(dat_array.data.tobytes(), seed=0)
static_table = [line.split() for line in static_table_raw.splitlines()]
static_table = filter(lambda entry: len(entry) >= 2, static_table)
static_table = sorted(static_table, key=lambda x: x[1])

cases = ""
for header, entry in itertools.groupby(static_table, lambda x: x[1]):
    entry = list(entry)
    indices = [nested[0] for nested in entry]
    values = [str.join(" ", nested[2:]) for nested in entry if len(nested) > 2]
    if len(values) == 0:
        index = entry[0][0]
        cases += no_values_template.format(
            xxhash.xxh64_intdigest(header), header, index
        )
    else:
        value_cases = ""
        for index, value in zip(indices, values):
            value_cases += value_template.format(
                xxhash.xxh64_intdigest(value), value, index
            )
        cases += values_template.format(
            xxhash.xxh64_intdigest(header), header, value_cases, indices[0]
        )

find_index = find_index_template.format(cases)

encode_generated_template = """\
#include <bnl/http3/header.hpp>