def test_precompute_compress_rawcontent(self):
    d = zstd.ZstdCompressionDict(b'dictcontent' * 64,
                                 dict_type=zstd.DICT_TYPE_RAWCONTENT)
    d.precompute_compress(level=1)

    d = zstd.ZstdCompressionDict(b'dictcontent' * 64,
                                 dict_type=zstd.DICT_TYPE_FULLDICT)
    with self.assertRaisesRegexp(zstd.ZstdError,
                                 'unable to precompute dictionary'):
        d.precompute_compress(level=1)
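# A minimal sketch contrasting the two load modes exercised above (synthetic
# samples; sizes are arbitrary): DICT_TYPE_RAWCONTENT accepts any bytes as
# dictionary content, while DICT_TYPE_FULLDICT requires data produced by
# dictionary training, which is why precompute_compress() fails on arbitrary
# bytes in full-dict mode.
raw = zstd.ZstdCompressionDict(b'any bytes work here' * 32,
                               dict_type=zstd.DICT_TYPE_RAWCONTENT)
raw.precompute_compress(level=3)  # fine in raw content mode

# A trained dictionary satisfies full-dict mode (training may fail if the
# samples share too little structure).
samples = [b'sample payload %d with shared structure' % i for i in range(1000)]
trained = zstd.train_dictionary(16384, samples)
trained.precompute_compress(level=3)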
def test_bad_subsequent_input(self):
    initial = zstd.ZstdCompressor().compress(b'foo' * 64)

    dctx = zstd.ZstdDecompressor()

    with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
        dctx.decompress_content_dict_chain([initial, u'foo'])

    with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'):
        dctx.decompress_content_dict_chain([initial, None])

    with self.assertRaisesRegexp(
            ValueError, 'chunk 1 is too small to contain a zstd frame'):
        dctx.decompress_content_dict_chain([initial, zstd.FRAME_HEADER])

    with self.assertRaisesRegexp(ValueError,
                                 'chunk 1 is not a valid zstd frame'):
        dctx.decompress_content_dict_chain([initial, b'foo' * 8])

    no_size = zstd.ZstdCompressor(write_content_size=False).compress(
        b'foo' * 64)

    with self.assertRaisesRegexp(ValueError,
                                 'chunk 1 missing content size in frame'):
        dctx.decompress_content_dict_chain([initial, no_size])

    # Corrupt second frame.
    cctx = zstd.ZstdCompressor(
        dict_data=zstd.ZstdCompressionDict(b'foo' * 64))
    frame = cctx.compress(b'bar' * 64)
    frame = frame[0:12] + frame[15:]

    with self.assertRaisesRegexp(zstd.ZstdError,
                                 'chunk 1 did not decompress full frame'):
        dctx.decompress_content_dict_chain([initial, frame])
def __init__(self, *, config: Config, name: str,
             module_configuration: ConfigDict) -> None:
    super().__init__(config=config, name=name,
                     module_configuration=module_configuration)

    self.level: int = Config.get_from_dict(
        module_configuration,
        'level',
        types=int,
        check_func=lambda v: v >= 1 and v <= zstandard.MAX_COMPRESSION_LEVEL,
        check_message='Option level must be between 1 and {} (inclusive)'.format(
            zstandard.MAX_COMPRESSION_LEVEL))

    dict_data_file: str = Config.get_from_dict(module_configuration,
                                               'dictDataFile', None,
                                               types=str)
    if dict_data_file:
        with open(dict_data_file, 'rb') as f:
            dict_data_content = f.read()
        self._dict_data = zstandard.ZstdCompressionDict(
            dict_data_content, dict_type=zstandard.DICT_TYPE_FULLDICT)
        self._dict_data.precompute_compress(self.level)
    else:
        self._dict_data = None

    self._local = threading.local()
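# A hedged sketch (not this module's actual code path) of how the precomputed
# dictionary above might be consumed: precompute_compress(level) digests the
# dictionary once, so per-thread compressors built via self._local can reuse
# the tables instead of rebuilding them on every construction. The helper
# name _compressor is hypothetical.
def _compressor(self) -> zstandard.ZstdCompressor:
    if not hasattr(self._local, 'compressor'):
        if self._dict_data is not None:
            self._local.compressor = zstandard.ZstdCompressor(
                level=self.level, dict_data=self._dict_data)
        else:
            self._local.compressor = zstandard.ZstdCompressor(level=self.level)
    return self._local.compressor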
def compress_content_dict_compress(chunks, zparams):
    zstd.ZstdCompressor(compression_params=zparams).compress(chunks[0])
    for i, chunk in enumerate(chunks[1:]):
        d = zstd.ZstdCompressionDict(chunks[i])
        zstd.ZstdCompressor(dict_data=d,
                            compression_params=zparams).compress(chunk)
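# A minimal round-trip sketch of the content-dictionary chain pattern the
# benchmark above measures: every chunk after the first is compressed using
# the previous *uncompressed* chunk as its dictionary, and the chain decodes
# in one call (the chunks here are synthetic).
chain_chunks = [b'chunk-%d ' % i * 64 for i in range(4)]

frames = [zstd.ZstdCompressor().compress(chain_chunks[0])]
for i, chunk in enumerate(chain_chunks[1:]):
    d = zstd.ZstdCompressionDict(chain_chunks[i])
    frames.append(zstd.ZstdCompressor(dict_data=d).compress(chunk))

dctx = zstd.ZstdDecompressor()
assert dctx.decompress_content_dict_chain(frames) == chain_chunks[-1]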
def _from_existing_db(db, complevel: int = 6) -> JsonBlobCache:
    readonly = db.flags().get('readonly')

    try:
        db_info = db.open_db(b'info', create=False)
    except lmdb.NotFoundError:
        raise ValueError('Existing database is not a ds cache')

    with db.begin(db_info, write=False) as tr:
        version = tr.get(b'version', None)
        if version is None:
            raise ValueError('Missing format version field')
        if version != FORMAT_VERSION:
            raise ValueError("Unsupported on disk version: " +
                             version.decode('utf8'))
        zdict = tr.get(b'zdict', None)

    dbs = SimpleNamespace(main=db,
                          info=db_info,
                          groups=db.open_db(b'groups', create=False),
                          ds=db.open_db(b'ds', create=False),
                          udata=db.open_db(b'udata', create=False))

    comp_params = {
        'dict_data': zstandard.ZstdCompressionDict(zdict)
    } if zdict else {}

    comp = None if readonly else zstandard.ZstdCompressor(level=complevel,
                                                          **comp_params)
    decomp = zstandard.ZstdDecompressor(**comp_params)
    state = SimpleNamespace(dbs=dbs, comp=comp, decomp=decomp)

    return JsonBlobCache(state)
def test_data_equivalence(self, original, threads, use_dict):
    kwargs = {}
    if use_dict:
        kwargs['dict_data'] = zstd.ZstdCompressionDict(original[0])

    cctx = zstd.ZstdCompressor(level=1,
                               write_content_size=True,
                               write_checksum=True,
                               **kwargs)

    frames_buffer = cctx.multi_compress_to_buffer(original, threads=-1)

    dctx = zstd.ZstdDecompressor(**kwargs)

    result = dctx.multi_decompress_to_buffer(frames_buffer)

    self.assertEqual(len(result), len(original))

    for i, frame in enumerate(result):
        self.assertEqual(frame.tobytes(), original[i])

    frames_list = [f.tobytes() for f in frames_buffer]
    result = dctx.multi_decompress_to_buffer(frames_list)

    self.assertEqual(len(result), len(original))

    for i, frame in enumerate(result):
        self.assertEqual(frame.tobytes(), original[i])
def _from_empty_db(db, complevel=6, zdict=None):
    assert isinstance(zdict, (bytes, type(None)))

    db_info = db.open_db(b'info', create=True)
    with db.begin(db_info, write=True) as tr:
        tr.put(b'version', FORMAT_VERSION)
        if zdict is not None:
            tr.put(b'zdict', zdict)

    dbs = SimpleNamespace(main=db,
                          info=db_info,
                          groups=db.open_db(b'groups', create=True),
                          ds=db.open_db(b'ds', create=True),
                          udata=db.open_db(b'udata', create=True))

    comp_params = {
        'dict_data': zstandard.ZstdCompressionDict(zdict)
    } if zdict else {}

    comp = zstandard.ZstdCompressor(level=complevel, **comp_params)
    decomp = zstandard.ZstdDecompressor(**comp_params)
    state = SimpleNamespace(dbs=dbs, comp=comp, decomp=decomp, products={})

    return DatasetCache(state)
def _from_empty_db(db, complevel: int = 6, zdict: Optional[bytes] = None):
    assert isinstance(zdict, (bytes, type(None)))

    db_info = db.open_db(b"info", create=True)
    with db.begin(db_info, write=True) as tr:
        tr.put(b"version", FORMAT_VERSION)
        if zdict is not None:
            tr.put(b"zdict", zdict)

    dbs = SimpleNamespace(
        main=db,
        info=db_info,
        groups=db.open_db(b"groups", create=True),
        ds=db.open_db(b"ds", create=True),
        udata=db.open_db(b"udata", create=True),
    )

    comp_params = {
        "dict_data": zstandard.ZstdCompressionDict(zdict)
    } if zdict else {}

    comp = zstandard.ZstdCompressor(level=complevel, **comp_params)
    decomp = zstandard.ZstdDecompressor(**comp_params)
    state = SimpleNamespace(dbs=dbs, comp=comp, decomp=decomp)

    return JsonBlobCache(state)
def decompress_content_dict_read_to_iter(chunks, opts):
    zctx = zstd.ZstdDecompressor(**opts)
    last = b"".join(zctx.read_to_iter(chunks[0]))

    for chunk in chunks[1:]:
        d = zstd.ZstdCompressionDict(last)
        zctx = zstd.ZstdDecompressor(dict_data=d, **opts)
        last = b"".join(zctx.read_to_iter(chunk))
def decompress_content_dict_decompressobj(chunks, opts):
    zctx = zstd.ZstdDecompressor(**opts)
    last = zctx.decompressobj().decompress(chunks[0])

    for chunk in chunks[1:]:
        d = zstd.ZstdCompressionDict(last)
        zctx = zstd.ZstdDecompressor(dict_data=d, **opts)
        last = zctx.decompressobj().decompress(chunk)
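# The two manual loops above replay the chain frame by frame; python-zstandard
# also exposes this as a single call, where each decoded chunk implicitly
# becomes the dictionary for the next frame. A hedged one-shot equivalent:
def decompress_content_dict_chain_oneshot(chunks, opts):
    zctx = zstd.ZstdDecompressor(**opts)
    return zctx.decompress_content_dict_chain(chunks)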
def algo_to_fun(algo, comp_level, dictionary):
    if algo == 'brotli':
        return brotli.compress, {'quality': comp_level}, 'br'
    elif algo == 'zstd':
        if dictionary:
            # Dictionary files are binary; read in 'rb' mode so
            # ZstdCompressionDict receives bytes, not str.
            with open(dictionary, 'rb') as f:
                dict_data = zstd.ZstdCompressionDict(f.read())
            cctx = zstd.ZstdCompressor(level=comp_level, dict_data=dict_data)
        else:
            cctx = zstd.ZstdCompressor(level=comp_level)
        return cctx.compress, {}, 'zst'
def compress_content_dict_read_to_iter(chunks, zparams, use_size=False):
    zctx = zstd.ZstdCompressor(compression_params=zparams)
    size = len(chunks[0]) if use_size else -1
    for o in zctx.read_to_iter(chunks[0], size=size):
        pass

    for i, chunk in enumerate(chunks[1:]):
        d = zstd.ZstdCompressionDict(chunks[i])
        zctx = zstd.ZstdCompressor(dict_data=d, compression_params=zparams)
        size = len(chunk) if use_size else -1
        for o in zctx.read_to_iter(chunk, size=size):
            pass
def compress_content_dict_compressobj(chunks, zparams, use_size=False):
    zctx = zstd.ZstdCompressor(compression_params=zparams)
    cobj = zctx.compressobj(size=len(chunks[0]) if use_size else -1)
    cobj.compress(chunks[0])
    cobj.flush()

    for i, chunk in enumerate(chunks[1:]):
        d = zstd.ZstdCompressionDict(chunks[i])
        zctx = zstd.ZstdCompressor(dict_data=d, compression_params=zparams)
        cobj = zctx.compressobj(len(chunk) if use_size else -1)
        cobj.compress(chunk)
        cobj.flush()
def compress_content_dict_write_to(chunks, opts, use_size=False):
    zctx = zstd.ZstdCompressor(**opts)
    b = bio()
    with zctx.write_to(b, size=len(chunks[0]) if use_size else 0) as compressor:
        compressor.write(chunks[0])

    for i, chunk in enumerate(chunks[1:]):
        d = zstd.ZstdCompressionDict(chunks[i])
        b = bio()
        zctx = zstd.ZstdCompressor(dict_data=d, **opts)
        with zctx.write_to(b, size=len(chunk) if use_size else 0) as compressor:
            compressor.write(chunk)
def main():
    zdict = r"custom_zstd_dict"
    with open(zdict, "rb") as f:
        zdict = f.read()
    zdict = zstd.ZstdCompressionDict(zdict)
    zc = zstd.ZstdCompressor(level=12,
                             dict_data=zdict,
                             threads=6,
                             # write_content_size=False,
                             write_checksum=True,
                             write_dict_id=False)
    zd = zstd.ZstdDecompressor(dict_data=zdict)

    def process(write=True, max=None):
        stats = []
        with h5py.File(r"html_gz.h5", "r") as rf:
            with h5py.File(r"html_zstd.h5", "w") as wf:
                rds = rf["/html_gz"]
                # sort of varbinary(), somewhat tricky but usable
                wds = wf.create_dataset('html_zstd', (0,), maxshape=(None,),
                                        dtype=h5py.special_dtype(vlen=np.uint8))
                t0 = time.perf_counter()
                records = 0
                for gz in rds:
                    records += 1
                    t1 = time.perf_counter()
                    html = gzip.decompress(gz.tobytes())
                    zs = zc.compress(html)
                    zs = np.frombuffer(zs, dtype=np.uint8)
                    hlen = len(html)
                    glen = len(gz)
                    zlen = len(zs)
                    t2 = time.perf_counter()
                    stats.append([hlen / 1024, hlen / glen, hlen / zlen,
                                  (t2 - t1) * 1000])
                    if write:
                        ds_append(wds, zs)
                    if records % 250 == 0:
                        df = pd.DataFrame(columns=["html", "gz", "zstd", "ms"],
                                          data=stats)
                        df["gz/zstd"] = df["zstd"] / df["gz"]
                        df = df.agg("mean")
                        dt = time.perf_counter() - t0
                        rec_per_sec = records / dt
                        remain = rds.len() - records
                        print(f"html: {df.loc['html']:.1f}k,",
                              ", ".join([f"{k:}: {df.loc[k]:>.2f}"
                                         for k in ("gz", "zstd", "gz/zstd")]))
                        print(f"{records} records processed in {dt / 60:.1f} min, "
                              f"{rec_per_sec:.0f} rec/s, {remain} remaining, "
                              f"ETA {remain / rec_per_sec / 60:.1f} min")
                        stats = []
                    if max is not None:
                        if records > max:
                            break

    process()  # max=5000)
def compress_content_dict_stream_writer(chunks, zparams, use_size=False):
    zctx = zstd.ZstdCompressor(compression_params=zparams)
    b = bio()
    with zctx.stream_writer(
            b, size=len(chunks[0]) if use_size else -1) as compressor:
        compressor.write(chunks[0])

    for i, chunk in enumerate(chunks[1:]):
        d = zstd.ZstdCompressionDict(chunks[i])
        b = bio()
        zctx = zstd.ZstdCompressor(dict_data=d, compression_params=zparams)
        with zctx.stream_writer(
                b, size=len(chunk) if use_size else -1) as compressor:
            compressor.write(chunk)
def _get_compressor(f4_file_path, compression_level):
    if compression_level is None:
        return None

    training_dict_file_path = CompressionHelper._get_training_dict_file_path(
        f4_file_path)

    if os.path.exists(training_dict_file_path):
        with open(training_dict_file_path, 'rb') as dict_file:
            training_dict = zstandard.ZstdCompressionDict(dict_file.read())
        return zstandard.ZstdCompressor(dict_data=training_dict,
                                        level=compression_level)

    return zstandard.ZstdCompressor(level=compression_level)
def decompress_content_dict_stream_writer(chunks, opts):
    zctx = zstd.ZstdDecompressor(**opts)
    b = bio()
    with zctx.stream_writer(b) as decompressor:
        decompressor.write(chunks[0])

    last = b.getvalue()
    for chunk in chunks[1:]:
        d = zstd.ZstdCompressionDict(last)
        zctx = zstd.ZstdDecompressor(dict_data=d, **opts)
        b = bio()
        with zctx.stream_writer(b) as decompressor:
            decompressor.write(chunk)
        last = b.getvalue()
def get_dictionary(filename: str) -> Optional[zstandard.ZstdCompressionDict]:
    s = internetarchive.get_session()
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=0-7'})
    # 0x184D2A5D (little-endian) is a zstd skippable frame magic number.
    if r.content[:4] != b'\x5D\x2A\x4D\x18':
        return None
    data_size = struct.unpack('<L', r.content[4:])[0]
    r = s.get('https://archive.org/download/' + filename,
              headers={'Range': 'bytes=8-{}'.format(8 + data_size - 1)})
    dictionary = r.content
    # 0xFD2FB528 (little-endian) is the zstd frame magic number; the embedded
    # dictionary may itself be zstd-compressed.
    if r.content[:4] == b'\x28\xB5\x2F\xFD':
        dictionary = zstandard.ZstdDecompressor().decompress(dictionary)
    # 0xEC30A437 (little-endian) is the zstd dictionary magic number.
    if dictionary[:4] != b'\x37\xA4\x30\xEC':
        raise ValueError('Not a dictionary.')
    return zstandard.ZstdCompressionDict(dictionary)
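# A hedged sketch of producing the layout get_dictionary() above expects:
# a zstd skippable frame (magic 0x184D2A5D little-endian, followed by a
# 32-bit little-endian payload size) carrying the dictionary at the start
# of the file. Both file names are hypothetical.
import struct

with open('samples.dict', 'rb') as f:  # a previously trained dictionary
    dict_bytes = f.read()

with open('archive_with_dict.bin', 'wb') as f:
    f.write(b'\x5D\x2A\x4D\x18')                 # skippable frame magic
    f.write(struct.pack('<L', len(dict_bytes)))  # payload size
    f.write(dict_bytes)                          # dictionary payload
    # ...followed by zstd frames compressed with that dictionary.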
def _get_decompressor(f4_file_path):
    level = CompressionHelper._get_level(f4_file_path)

    if level is not None:
        training_dict_file_path = CompressionHelper._get_training_dict_file_path(
            f4_file_path)

        if os.path.exists(training_dict_file_path):
            with open(training_dict_file_path, "rb") as dict_file:
                training_dict = zstandard.ZstdCompressionDict(
                    dict_file.read())
            return zstandard.ZstdDecompressor(dict_data=training_dict)

        return zstandard.ZstdDecompressor()

    return None
def test_data_equivalence(self, original, threads, use_dict):
    kwargs = {}

    # Use a content dictionary because it is cheap to create.
    if use_dict:
        kwargs["dict_data"] = zstd.ZstdCompressionDict(original[0])

    cctx = zstd.ZstdCompressor(level=1, write_checksum=True, **kwargs)

    result = cctx.multi_compress_to_buffer(original, threads=-1)

    self.assertEqual(len(result), len(original))

    # The frame produced via the batch APIs may not be bit identical to that
    # produced by compress() because compression parameters are adjusted
    # from the first input in batch mode. So the only thing we can do is
    # verify the decompressed data matches the input.
    dctx = zstd.ZstdDecompressor(**kwargs)

    for i, frame in enumerate(result):
        self.assertEqual(dctx.decompress(frame), original[i])
def main(source, dest):
    if not os.path.exists(source):
        raise IOError(f'No source database found at {source}')
    try:
        # https://sqlite-utils.datasette.io/en/stable/python-api.html
        db = Database(source, tracer=tracer)

        # If there are errors in the user defined function, this is the
        # only way to get the actual error and not
        # "user-defined function raised exception"
        sqlite3.enable_callback_tracebacks(True)

        dctx = None
        if db["zstd_dicts"].exists():
            dict_row = db.execute(
                "select dict_bytes from zstd_dicts LIMIT 1").fetchone()
            if dict_row is None:
                raise Exception("No dictionary found in zstd_dicts table!")
            else:
                dict_data = zstd.ZstdCompressionDict(dict_row[0])
                dctx = zstd.ZstdDecompressor(dict_data=dict_data)
        else:
            dctx = zstd.ZstdDecompressor()

        @db.register_function
        def decompress(s):
            return dctx.decompress(s)

        initialize(dest)
        db.attach("decompress", dest)
        db.execute("""
            insert into decompress.entries (epoch_secs, nanos, level, content)
            select epoch_secs, nanos, level, decompress(content) from entries
        """)
        db.execute("COMMIT")
    except Exception as e:
        print("Unexpected error:", e)
def test_simple(self):
    original = [
        b'foo' * 64,
        b'foobar' * 64,
        b'baz' * 64,
        b'foobaz' * 64,
        b'foobarbaz' * 64,
    ]

    chunks = []
    chunks.append(zstd.ZstdCompressor().compress(original[0]))
    for i, chunk in enumerate(original[1:]):
        d = zstd.ZstdCompressionDict(original[i])
        cctx = zstd.ZstdCompressor(dict_data=d)
        chunks.append(cctx.compress(chunk))

    for i in range(1, len(original)):
        chain = chunks[0:i]
        expected = original[i - 1]
        dctx = zstd.ZstdDecompressor()
        decompressed = dctx.decompress_content_dict_chain(chain)
        self.assertEqual(decompressed, expected)
def _from_existing_db(db, complevel: int = 6) -> JsonBlobCache:
    readonly = db.flags().get("readonly")

    try:
        db_info = db.open_db(b"info", create=False)
    except lmdb.NotFoundError:
        raise ValueError("Existing database is not a ds cache")

    with db.begin(db_info, write=False) as tr:
        version = tr.get(b"version", None)
        if version is None:
            raise ValueError("Missing format version field")
        if version != FORMAT_VERSION:
            raise ValueError("Unsupported on disk version: " +
                             version.decode("utf8"))
        zdict = tr.get(b"zdict", None)

    dbs = SimpleNamespace(
        main=db,
        info=db_info,
        groups=db.open_db(b"groups", create=False),
        ds=db.open_db(b"ds", create=False),
        udata=db.open_db(b"udata", create=False),
    )

    comp_params = {
        "dict_data": zstandard.ZstdCompressionDict(zdict)
    } if zdict else {}

    comp = (None if readonly else
            zstandard.ZstdCompressor(level=complevel, **comp_params))
    decomp = zstandard.ZstdDecompressor(**comp_params)
    state = SimpleNamespace(dbs=dbs, comp=comp, decomp=decomp)

    return JsonBlobCache(state)
def decompress(fexts, dictionary=None):
    # directory with files to be decompressed
    workingdir = 'downloaded_tweets'

    # Set up zstd decompressor
    if dictionary:
        # Dictionary files are binary; read as bytes for ZstdCompressionDict.
        with open(dictionary, 'rb') as f:
            dict_data = zstd.ZstdCompressionDict(f.read())
        dctx = zstd.ZstdDecompressor(dict_data=dict_data)
    else:
        dctx = zstd.ZstdDecompressor()

    # decompress files
    for filename in os.listdir(workingdir):
        tw_handle, ext = os.path.splitext(filename)
        if ext in fexts:
            cur_file = os.path.join(workingdir, filename)

            # Decompress the file (compressed input must be read as bytes)
            with open(cur_file, 'rb') as f:
                if ext == '.br':
                    decompressed = brotli.decompress(f.read())
                elif ext == '.zst':
                    decompressed = dctx.decompress(f.read())

            # Write the decompressed .json file (decompressed data is bytes)
            new_file = os.path.join(workingdir, tw_handle + '.json')
            with open(new_file, 'wb') as f:
                f.write(decompressed)

            # Delete old compressed file
            if settings.DECOMPRESSOR_DELETE_COMPRESSED_FILES:
                os.remove(cur_file)
def test_bad_mode(self):
    with self.assertRaisesRegex(ValueError, "invalid dictionary load mode"):
        zstd.ZstdCompressionDict(b"foo", dict_type=42)
def read_dec(data, start, stop):
    characters = ''
    for i in range(start, stop):
        characters += str(data[i]) + ' '
    return characters


name = '10.01.2020_19.24_dict_64_16'
path = handle_path.HandlePath()
path.read_dict(name)
dict_data = byte_data.read(path.input_file_path)
# dict_data = byte_data.read('X:\\Studia\\III rok\\VI semestr\\Inżynierka\\ThesisDataAnalysis\\output_data\\19.45_03.01.2020_dict_64_16')
dictionary = zstd.ZstdCompressionDict(dict_data)
print(dictionary.dict_id())

data_size = os.path.getsize(path.input_file_path)
print(data_size)
print('Dict length: ', len(dict_data))

# `read` is a sibling helper defined elsewhere in this script, analogous to
# read_dec above but with a different byte rendering.
magic_number = read(dict_data, 0, 4)
print(magic_number)
magic_number_dec = read_dec(dict_data, 0, 4)
print(magic_number_dec)
dict_id = read(dict_data, 4, 8)
print(dict_id)
rest = read(dict_data, 4, 149)
print(rest)
dec = read_dec(dict_data, 4, 149)
        x = Khaki()
        return x.dumps(*args, **kwargs)

    @staticmethod
    def loads(*args, **kwargs):
        x = Khaki()
        return x.loads(*args, **kwargs)


if DIRECT_TRAINED_ZSTD:
    ZSTD_TRAINING_POOL = cachedDownload(TRAINING_DATA_URL) + cachedDownload(
        ALT_TRAINING_DATA_URL)
    TRAINED_DICTIONARIES = {}
    for level in range(1, 23):
        x = zstd.ZstdCompressionDict(ZSTD_TRAINING_POOL[:8388608],
                                     dict_type=zstd.DICT_TYPE_RAWCONTENT)
        x.precompute_compress(level=level)
        TRAINED_DICTIONARIES[level] = x


class ZStandardTrainedSimpleInterface(object):
    @staticmethod
    def compress(data: bytes, level: int = 1) -> bytes:
        x = zstd.ZstdCompressor(level=level,
                                threads=cpu_count(),
                                dict_data=TRAINED_DICTIONARIES[level])
        return level.to_bytes(1, byteorder=BYTE_ORDER) + x.compress(data)

    @staticmethod
    def decompress(data: bytes) -> bytes:
        x = zstd.ZstdDecompressor(dict_data=TRAINED_DICTIONARIES[
            int.from_bytes(data[:1], byteorder=BYTE_ORDER)])
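# A hedged usage sketch for ZStandardTrainedSimpleInterface above, assuming
# the truncated decompress() body ends with `return x.decompress(data[1:])`:
# the first byte of the blob records the level so decompress() can select
# the matching precomputed dictionary.
payload = b'hello zstd' * 100
blob = ZStandardTrainedSimpleInterface.compress(payload, level=3)
restored = ZStandardTrainedSimpleInterface.decompress(blob)  # == payload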
compressed_size = sum(map(len, compressed_stream))
ratio = float(compressed_size) / float(orig_size) * 100.0
print("stream compressed size (l=%d): %d (%.2f%%)" %
      (zparams.compression_level, compressed_size, ratio))

if args.content_dict:
    compressed_content_dict = []
    ratios = []

    # First chunk is compressed like normal.
    c = zstd.ZstdCompressor(compression_params=zparams).compress(chunks[0])
    compressed_content_dict.append(c)
    ratios.append(float(len(c)) / float(len(chunks[0])))

    # Subsequent chunks use previous chunk as a dict.
    for i, chunk in enumerate(chunks[1:]):
        d = zstd.ZstdCompressionDict(chunks[i])
        zctx = zstd.ZstdCompressor(dict_data=d, compression_params=zparams)
        c = zctx.compress(chunk)
        compressed_content_dict.append(c)
        ratios.append(float(len(c)) / float(len(chunk)))

    compressed_size = sum(map(len, compressed_content_dict))
    ratio = float(compressed_size) / float(orig_size) * 100.0
    bad_count = sum(1 for r in ratios if r >= 1.00)
    good_ratio = 100.0 - (float(bad_count) / float(len(chunks)) * 100.0)
    print("content dict compressed size (l=%d): %d (%.2f%%); smaller: %.2f%%" %
          (zparams.compression_level, compressed_size, ratio, good_ratio))

print("")
def test_bad_mode(self):
    with self.assertRaisesRegexp(ValueError, 'invalid dictionary load mode'):
        zstd.ZstdCompressionDict(b'foo', dict_type=42)