def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_images, columns):
    rtn = {}
    if doc[VERSION] != 3:
        raise ArcticException("Unhandled document version: %s" % doc[VERSION])
    # np.cumsum copies the read-only array created with frombuffer
    rtn[INDEX] = np.cumsum(np.frombuffer(lz4_decompress(doc[INDEX]), dtype='uint64'))
    doc_length = len(rtn[INDEX])
    column_set.update(doc[COLUMNS].keys())

    # get the mask for the columns we're about to load
    union_mask = np.zeros((doc_length + 7) // 8, dtype='uint8')
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            # the or below will make a copy of this read-only array
            mask = np.frombuffer(lz4_decompress(coldata[ROWMASK]), dtype='uint8')
            union_mask = union_mask | mask
        except KeyError:
            rtn[c] = None
    union_mask = np.unpackbits(union_mask)[:doc_length].astype('bool')
    rtn_length = np.sum(union_mask)

    rtn[INDEX] = rtn[INDEX][union_mask]
    if include_symbol:
        rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length

    # Unpack each requested column in turn
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            dtype = np.dtype(coldata[DTYPE])
            # values ends up being copied by pandas before being returned to the user.
            # However, we copy it into a bytearray here for safety.
            values = np.frombuffer(bytearray(lz4_decompress(coldata[DATA])), dtype=dtype)
            self._set_or_promote_dtype(column_dtypes, c, dtype)
            rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
            # unpackbits will make a copy of the read-only array created by frombuffer
            rowmask = np.unpackbits(np.frombuffer(lz4_decompress(coldata[ROWMASK]),
                                                  dtype='uint8'))[:doc_length].astype('bool')
            rowmask = rowmask[union_mask]
            rtn[c][rowmask] = values
        except KeyError:
            rtn[c] = None

    if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
        rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length, column_dtypes,
                                  column_set, columns)
    return rtn

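# A minimal, standalone sketch of the row-mask scheme _read_bucket relies on:
# each column stores a compressed bitmap with one bit per row in the bucket;
# np.unpackbits expands it byte-wise, and the [:doc_length] slice trims the
# zero padding in the final byte.
import numpy as np

present = np.array([1, 0, 1, 1, 0], dtype=np.uint8)   # rows 0, 2, 3 have data
mask_bytes = np.packbits(present)                     # packed to whole bytes
rowmask = np.unpackbits(mask_bytes)[:len(present)].astype(bool)
assert list(rowmask) == [True, False, True, True, False]
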
async def get_master(res_ver, to_path):
    manifest = await read_manifest(res_ver, "Android", "High", "High")
    if not manifest:
        return None

    cur = manifest.execute("SELECT hash, attr FROM manifests WHERE name = ?", ("master.mdb",))
    hash, attr = cur.fetchone()
    manifest.close()

    url = SQLBASEURL.format(hash, hash[0:2])
    cl = httpclient.AsyncHTTPClient()
    try:
        mashttp = await cl.fetch(url, headers=extra_acquisition_headers())
    except Exception as e:
        print("get_master: unhandled error while getting master:", e)
        return None

    buf = mashttp.buffer.read()
    bio = io.BytesIO()
    bio.write(buf[4:8])
    bio.write(buf[16:])
    data = lz4_decompress(bio.getvalue())

    with open(to_path, "wb") as write_db:
        write_db.write(data)

    mdate = mashttp.headers.get("Last-Modified")
    if mdate:
        tt = parsedate_tz(mdate)
        mtime = mktime_tz(tt) if tt else int(time.time())
    else:
        mtime = int(time.time())
    os.utime(to_path, (-1, mtime))
    return to_path

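# The buf[4:8] + buf[16:] shuffle above rebuilds python-lz4's default block
# framing (a 4-byte little-endian uncompressed size followed by the raw
# stream) from the asset server's 16-byte container header, whose bytes 4:8
# appear to carry that same size field. A hypothetical helper capturing the
# trick (name illustrative, not part of the original module):
def _unwrap_container_lz4(buf):
    return lz4_decompress(buf[4:8] + buf[16:])
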
def decompress(self, source, cursor, compressedbytes, uncompressedbytes=None):
    if self.algo == uproot.const.kZLIB:
        from zlib import decompress as zlib_decompress
        return zlib_decompress(cursor.bytes(source, compressedbytes))

    elif self.algo == uproot.const.kLZMA:
        try:
            from lzma import decompress as lzma_decompress
        except ImportError:
            try:
                from backports.lzma import decompress as lzma_decompress
            except ImportError:
                raise ImportError("Install lzma package with:\n    pip install backports.lzma\nor\n    conda install -c conda-forge backports.lzma\n(or just use Python >= 3.3).")
        return lzma_decompress(cursor.bytes(source, compressedbytes))

    elif self.algo == uproot.const.kOldCompressionAlgo:
        raise NotImplementedError("ROOT's \"old\" algorithm (fCompress 300) is not supported")

    elif self.algo == uproot.const.kLZ4:
        try:
            from lz4.block import decompress as lz4_decompress
        except ImportError:
            raise ImportError("Install lz4 package with:\n    pip install lz4\nor\n    conda install -c anaconda lz4")
        if uncompressedbytes is None:
            raise ValueError("lz4 needs to know the uncompressed number of bytes")
        return lz4_decompress(cursor.bytes(source, compressedbytes), uncompressed_size=uncompressedbytes)

    else:
        raise ValueError("unrecognized compression algorithm: {0}".format(self.algo))

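# A minimal round trip (assuming the same `lz4` package) showing why the kLZ4
# branch above must be told the uncompressed size: ROOT stores raw LZ4 blocks
# with no embedded length, unlike python-lz4's default size-prefixed framing.
from lz4.block import compress as _lz4c, decompress as _lz4d

payload = b"basket contents" * 64
raw = _lz4c(payload, store_size=False)                # no 4-byte size header
assert _lz4d(raw, uncompressed_size=len(payload)) == payload
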
def decompress_array(str_list):
    """ Decompress a list of strings """
    global _compress_thread_pool
    if not str_list:
        return str_list
    if not ENABLE_PARALLEL or len(str_list) <= LZ4_N_PARALLEL:
        return [lz4_decompress(chunk) for chunk in str_list]
    if _compress_thread_pool is None:
        _compress_thread_pool = ThreadPool(LZ4_WORKERS)
    return _compress_thread_pool.map(lz4_decompress, str_list)

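# Usage sketch for decompress_array, assuming the module's matching compress
# wrapper (lz4_compress underneath). A ThreadPool is effective here because
# the lz4 C routines release the GIL while (de)compressing, so batches above
# the LZ4_N_PARALLEL threshold really do decompress in parallel.
compressed = [lz4_compress(("row %d" % i).encode() * 100) for i in range(5000)]
original = decompress_array(compressed)
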
def decode(self) -> MappingsBuilder:
    try:
        header = self.read_nullterm()
    except BinaryMappingsError as e:
        raise BinaryMappingsError("Invalid header!") from e.__cause__
    if header != "SuperSrg binary mappings":
        raise BinaryMappingsError(f"Unexpected header: {header}")
    version = self.read_u32()
    if version != 1:
        raise BinaryMappingsError(f"Unexpected version: {version}")
    compression = self.read_string()
    if compression == "":
        # Uncompressed data is used as-is
        pass
    elif compression == "lz4-block":
        if lz4_decompress is None:
            raise BinaryMappingsError("Missing lz4 compression module!")
        decompressed = lz4_decompress(self.data_view[self.index:])
        self.data = decompressed
        self.data_view = memoryview(decompressed)
        self.index = 0
    elif compression in ("lzma2", "gzip"):
        raise BinaryMappingsError(f"Unsupported compression: {compression}")
    else:
        raise BinaryMappingsError(f"Forbidden compression: {compression}")
    builder = MappingsBuilder()
    num_classes = self.read_u64()
    for _ in range(num_classes):
        original_class = JavaClass(self.read_string())
        revised_class_name = self.read_string()
        revised_class = JavaClass(revised_class_name) if revised_class_name else original_class
        num_methods = self.read_u32()
        for _ in range(num_methods):
            original_name = self.read_string()
            revised_name = self.read_string()
            original_signature = MethodSignature.parse(self.read_string())
            self.read_string()  # Ignore the revised signature
            original_data = MethodData(original_class, original_name, original_signature)
            builder.method_names[original_data] = intern(revised_name)
        num_fields = self.read_u32()
        for _ in range(num_fields):
            original_name = self.read_string()
            revised_name = self.read_string()
            original_data = FieldData(original_class, original_name)
            assert original_name != revised_name, f"Redundant field: {original_data}"
            builder.field_names[original_data] = intern(revised_name)
    return builder

def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]

    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()

    now = dt.now()
    c.decompress_array(c.compressHC_array(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()

    now = dt.now()
    [lz4_decompress(y) for y in [lz4_compress(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()

    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("    LZ4 HC %s s" % clz4_time)
    print("    LZ4 HC Parallel %s s" % clz4_time_p)
    print("    LZ4 %s s" % lz4_time)

def _decompressfcn(compression, objlen, debug=False):
    algo, level = compression
    if algo == "zlib":
        # skip 9-byte header for ROOT's custom frame:
        # https://github.com/root-project/root/blob/master/core/zip/src/Bits.h#L646
        if debug:
            def out(x):
                print("decompressing {0} bytes".format(len(x) - 9))
                return zlib_decompress(x[9:])
            return out
        else:
            return lambda x: zlib_decompress(x[9:])

    elif algo == "lzma":
        # skip 9-byte header for LZMA, too:
        # https://github.com/root-project/root/blob/master/core/lzma/src/ZipLZMA.c#L81
        if debug:
            def out(x):
                print("decompressing {0} bytes".format(len(x) - 9))
                return lzma_decompress(x[9:])
            return out
        else:
            return lambda x: lzma_decompress(x[9:])

    elif algo == "lz4":
        # skip 9-byte header plus 8-byte hash: are there any official ROOT versions without the hash?
        # https://github.com/root-project/root/blob/master/core/lz4/src/ZipLZ4.cxx#L38
        if debug:
            def out(x):
                print("decompressing {0} bytes".format(len(x) - 9 - 8))
                return lz4_decompress(x[9 + 8:], uncompressed_size=objlen)
            return out
        else:
            return lambda x: lz4_decompress(x[9 + 8:], uncompressed_size=objlen)

    else:
        raise NotImplementedError("cannot decompress \"{0}\"".format(algo))

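# For reference, the 9-byte header skipped above is ROOT's compressed-record
# header: a 2-byte algorithm tag (b"ZL", b"XZ", b"L4", ...), one
# method/version byte, then two 3-byte little-endian sizes (compressed, then
# uncompressed). A hedged parser sketch; the name is illustrative:
def _parse_root_compression_header(hdr):
    tag = bytes(hdr[0:2])
    method = hdr[2]
    c_size = hdr[3] | (hdr[4] << 8) | (hdr[5] << 16)
    u_size = hdr[6] | (hdr[7] << 8) | (hdr[8] << 16)
    return tag, method, c_size, u_size
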
async def acquire_manifest(version, platform, asset_qual, sound_qual, dest_file):
    cl = httpclient.AsyncHTTPClient()
    meta_url = "/".join((DBMANIFEST.format(version), "all_dbmanifest"))
    try:
        meta = await cl.fetch(meta_url, headers=extra_acquisition_headers())
    except Exception as e:
        print("acquire_manifest: unhandled error while getting meta:", e)
        return None

    m = meta.body.decode("utf8")
    mp = map(lambda x: manifest_selector_t(*x.split(",")), filter(bool, m.split("\n")))

    get_file = None
    for selector in mp:
        if selector.platform == platform and \
           selector.asset_qual == asset_qual and \
           selector.sound_qual == sound_qual:
            get_file = selector.filename
            break
    else:
        print("No candidate found for", platform, asset_qual, sound_qual)
        return None

    abso = "/".join((DBMANIFEST.format(version), get_file))
    try:
        mani = await cl.fetch(abso, headers=extra_acquisition_headers())
    except Exception as e:
        print("acquire_manifest: unhandled error while getting manifest:", e)
        return None

    buf = mani.buffer.read()
    # same container-header rebuild as in get_master above
    bio = io.BytesIO()
    bio.write(buf[4:8])
    bio.write(buf[16:])
    data = lz4_decompress(bio.getvalue())

    with open(dest_file, "wb") as write_db:
        write_db.write(data)
    return dest_file

def mozlz4_decompress(data):
    if len(data) < 8 or data[:8] != b'mozLz40\0':
        raise Exception('Invalid mozlz4 header')
    return lz4_decompress(data[8:])

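# Usage sketch for mozlz4_decompress; the path is illustrative. After the
# 8-byte b'mozLz40\0' magic, mozlz4 is exactly python-lz4's size-prefixed
# block format, which is why plain lz4_decompress handles the remainder.
with open("recovery.jsonlz4", "rb") as f:
    session_json = mozlz4_decompress(f.read())
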
def decompress(_str):
    """ Decompress a string """
    return lz4_decompress(_str)
