def zstdlines(filename, encoding="utf-8", bufsize=65536):
    """
    Generator over lines from a zstd compressed file.

    >>> for line in zstdlines("file.zst"):
    ...     print(line)
    """
    with open(filename, "rb") as f:
        decomp = ZstdDecompressor()
        with decomp.stream_reader(f) as reader:
            prev_line = ""
            while True:
                chunk = reader.read(bufsize)
                if not chunk:
                    break
                while True:
                    # We start with bytes but want unicode, which might not
                    # align; so we jitter around the end to complete the
                    # codepoint.
                    try:
                        string_data = chunk.decode(encoding)
                    except UnicodeDecodeError:
                        chunk = chunk + reader.read(1)
                    else:
                        break
                # Prepend the partial line carried over from the previous
                # chunk so it is not lost when a chunk has no newline.
                lines = (prev_line + string_data).split("\n")
                for line in lines[:-1]:
                    yield line
                prev_line = lines[-1]
            # Emit a final line that lacks a trailing newline.
            if prev_line:
                yield prev_line
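A quick round-trip sketch (not from the original source): write a small zstd file with ZstdCompressor.stream_writer, then stream it back through zstdlines above. The path "example.zst" is made up for illustration.

from zstandard import ZstdCompressor

def _demo_zstdlines():
    # Compress two lines into a throwaway file...
    with open("example.zst", "wb") as fh:
        with ZstdCompressor().stream_writer(fh) as writer:
            writer.write(b"first line\nsecond line\n")
    # ...and iterate them back without loading the whole file.
    for line in zstdlines("example.zst"):
        print(line)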
def run(judge_class: Callable[[], JudgeDriver], task: JudgeTask) -> JudgeStatus:
    LOGGER.info('judge start (contest_id: {}, problem_id: {}, '
                'submission_id: {}, user_id: {})'.format(
                    task.contest_id, task.problem_id, task.id, task.user_id))
    zctx = ZstdDecompressor()
    try:
        task.code = zctx.decompress(task.code)
        for test in task.tests:
            test.input = zctx.decompress(test.input)
            test.output = zctx.decompress(test.output)
    except Exception:
        LOGGER.warning('decompress failed', exc_info=True)
        with transaction() as s:
            return _update_submission_status(s, task,
                                             JudgeStatus.InternalError)
    with judge_class() as judge:
        ret = _prepare(judge, task)
        if ret:
            return ret
        if task.compile_image_name:
            ret = _compile(judge, task)
            if ret:
                return ret
        ret = _tests(judge, task)
    LOGGER.info('judge finished (submission_id={}): {}'.format(task.id, ret))
    return ret
def __enter__(self):
    codec = detect_compression(self.file)
    # (1) Open the input file
    # We keep a reference (`f`) to the original file,
    # in order to be able to close it on exit.
    # Calling `close()` on `ZstdDecompressor` does not
    # close the underlying resource.
    self.f = self.file.open("rb")
    # UTF-8 decoded, decompressed file
    self.fb = self.f
    # (2) Set up the decompressor, if needed
    if codec == CompressionFormat.Zstandard:
        dict_data = ZstdCompressionDict(dictionary.read_bytes())
        ctx = ZstdDecompressor(dict_data=dict_data)
        self.fb = ctx.stream_reader(self.fb, read_across_frames=True)
    # (3) Decode the file
    self.fb = TextIOWrapper(self.fb, "utf-8")
    # (4) Deserialize the records
    stream = map(json_tryloads, self.fb)
    # (5) Apply the filters
    stream = filter(
        lambda record: all(fn.keep(record) for fn in self.filters), stream)
    # (6) Apply the transformers
    for fn in self.transformers:
        stream = map(fn, stream)
    return stream
def test_log(tmpfile):
    records = [{"msm_id": 1234, "prb_id": 5678},
               {"msm_id": 9876, "prb_id": 5432}]
    with AtlasRecordsWriter(tmpfile, compression=True, log=True) as w:
        log_file = w.log_file
        for record in records:
            w.write(record)
    # TODO: Methods to simplify log reading?
    # Zstandard decompression context
    dict_data = ZstdCompressionDict(dictionary.read_bytes())
    ctx = ZstdDecompressor(dict_data=dict_data)
    f = tmpfile.open("rb")
    log_f = log_file.open("rb")
    log = LogEntry.iter_unpack(log_f.read())
    for i, (size, msm_id, prb_id) in enumerate(log):
        rec = json.loads(ctx.decompress(f.read(size)).decode("utf-8"))
        assert rec == records[i]
        assert msm_id == records[i]["msm_id"]
        assert prb_id == records[i]["prb_id"]
    f.close()
    log_f.close()
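Both this test and the __enter__ method earlier assume a pre-built zstd dictionary on disk. A self-contained sketch of that shared-dictionary pattern, with made-up sample records (zstandard's train_dictionary needs a reasonable volume of sample data to succeed):

import zstandard

# Hypothetical samples; real usage would train on representative records.
samples = [b'{"msm_id": %d, "prb_id": %d}' % (i, i * 2) for i in range(5000)]
dict_data = zstandard.train_dictionary(1024, samples)  # ZstdCompressionDict

# Compressor and decompressor must be built from the same dictionary.
compressed = zstandard.ZstdCompressor(dict_data=dict_data).compress(samples[0])
restored = zstandard.ZstdDecompressor(dict_data=dict_data).decompress(compressed)
assert restored == samples[0]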
def __decompress_zst_file(self, file, with_extension):
    with open(file, 'rb') as compressed:
        decomp = ZstdDecompressor()
        file_name = f"{self.destination_folder}/{file.name}{with_extension}"
        with open(file_name, 'wb') as destination:
            decomp.copy_stream(compressed, destination)
    print_green(f"unpacked zst file completed to {file_name}")
def __decompressNcz(nspf, f):
    ncaHeaderSize = 0x4000
    blockID = 0
    nspf.seek(0)
    header = nspf.read(ncaHeaderSize)
    start = f.tell()
    magic = nspf.read(8)
    if not magic == b'NCZSECTN':
        raise ValueError("No NCZSECTN found! Is this really a .ncz file?")
    sectionCount = readInt64(nspf)
    sections = [Section(nspf) for _ in range(sectionCount)]
    nca_size = ncaHeaderSize
    for i in range(sectionCount):
        nca_size += sections[i].size
    pos = nspf.tell()
    blockMagic = nspf.read(8)
    nspf.seek(pos)
    useBlockCompression = blockMagic == b'NCZBLOCK'
    blockSize = -1
    if useBlockCompression:
        BlockHeader = Block(nspf)
        blockDecompressorReader = BlockDecompressorReader.BlockDecompressorReader(
            nspf, BlockHeader)
    pos = nspf.tell()
    if not useBlockCompression:
        decompressor = ZstdDecompressor().stream_reader(nspf)
    hash = sha256()
    f.write(header)
    hash.update(header)
    for s in sections:
        i = s.offset
        crypto = AESCTR(s.cryptoKey, s.cryptoCounter)
        end = s.offset + s.size
        while i < end:
            crypto.seek(i)
            chunkSz = 0x10000 if end - i > 0x10000 else end - i
            if useBlockCompression:
                inputChunk = blockDecompressorReader.read(chunkSz)
            else:
                inputChunk = decompressor.read(chunkSz)
            if not len(inputChunk):
                break
            if not useBlockCompression:
                decompressor.flush()
            if s.cryptoType in (3, 4):
                inputChunk = crypto.encrypt(inputChunk)
            f.write(inputChunk)
            hash.update(inputChunk)
            i += len(inputChunk)
    hexHash = hash.hexdigest()
    end = f.tell()
    written = (end - start)
    return (written, hexHash)
def decompress(self, fobj: IO[bytes]) -> IO[bytes]:
    decompressor = ZstdDecompressor()
    outfobj = NamedTemporaryFile(delete=False)
    try:
        decompressor.copy_stream(fobj, outfobj)
        outfobj.seek(0)
        yield outfobj
    finally:
        outfobj.close()
        remove(outfobj.name)
def open_read(self, path: str) -> IO[bytes]:
    decompressor = ZstdDecompressor()
    outfobj = NamedTemporaryFile(delete=False)
    try:
        with open(path, 'rb') as infobj:
            decompressor.copy_stream(infobj, outfobj)
        outfobj.seek(0)
        yield outfobj
    finally:
        outfobj.close()
        remove(outfobj.name)
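decompress and open_read above are generator functions, so they are presumably wrapped with contextlib.contextmanager at their definition sites. A standalone sketch of the same temporary-file pattern, assuming only the standard library and zstandard:

from contextlib import contextmanager
from os import remove
from tempfile import NamedTemporaryFile
from typing import IO, Iterator

from zstandard import ZstdDecompressor

@contextmanager
def zstd_to_tempfile(fobj: IO[bytes]) -> Iterator[IO[bytes]]:
    # Spill the decompressed stream to a temporary file so the caller
    # gets a seekable handle, then clean up on exit.
    outfobj = NamedTemporaryFile(delete=False)
    try:
        ZstdDecompressor().copy_stream(fobj, outfobj)
        outfobj.seek(0)
        yield outfobj
    finally:
        outfobj.close()
        remove(outfobj.name)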
def _get_test_data(contest_id: str, problem_id: str, test_id: str,
                   is_input: bool) -> Response:
    zctx = ZstdDecompressor()
    from io import BytesIO
    with transaction() as s:
        _ = _validate_token(s, admin_required=True)
        tc = s.query(TestCase).filter(TestCase.contest_id == contest_id,
                                      TestCase.problem_id == problem_id,
                                      TestCase.id == test_id).first()
        if not tc:
            abort(404)
        f = BytesIO(zctx.decompress(tc.input if is_input else tc.output))
        return send_file(f, as_attachment=True,
                         attachment_filename='{}.{}'.format(
                             test_id, 'in' if is_input else 'out'))
class TinyIndexBase:
    def __init__(self, item_type: type, num_pages: int, page_size: int):
        self.item_type = item_type
        self.num_pages = num_pages
        self.page_size = page_size
        self.decompressor = ZstdDecompressor()
        self.mmap = None

    def retrieve(self, key: str):
        index = self._get_key_page_index(key)
        page = self.get_page(index)
        if page is None:
            return []
        return self.convert_items(page)

    def _get_key_page_index(self, key):
        key_hash = mmh3.hash(key, signed=False)
        return key_hash % self.num_pages

    def get_page(self, i):
        """
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        return json.loads(decompressed_data.decode('utf8'))

    def convert_items(self, items):
        return [self.item_type(*item) for item in items]
@classmethod
def from_file(cls, file):
    data = []
    # TODO: Unified "file loader"
    # (detect_compression is used in many places)
    codec = detect_compression(file)
    with open(file, "rb") as f:
        if codec == CompressionFormat.Zstandard:
            ctx = ZstdDecompressor()
            f = ctx.stream_reader(f)
        f = TextIOWrapper(f, "utf-8")
        for line in f:
            if line.startswith(";"):
                continue
            prefix, origins = line.split("\t")
            origins = [int(x) for x in origins.split(",")]
            data.append((prefix, origins))
    return cls(data)
def __init__(self, item_type: type, index_path: str, num_pages: int,
             page_size: int):
    super().__init__(item_type, num_pages, page_size)
    self.index_path = index_path
    self.compressor = ZstdCompressor()
    self.decompressor = ZstdDecompressor()
    self.index_file = None
    self.mmap = None
def zopen(fn, mode="r", *args, **kwargs):
    import codecs
    objs = None
    if fn.endswith(".gz"):
        import gzip
        objs = [gzip.open(fn, "rb")]
    elif fn.endswith(".bz2"):
        import bz2
        objs = [bz2.open(fn, "rb")]
    elif fn.endswith(".xz"):
        import lzma
        objs = [lzma.open(fn, "rb")]
    elif fn.endswith(".zst"):
        from zstandard import ZstdDecompressor
        try:
            # documentation says KiB but it seems to be bytes
            ctx = ZstdDecompressor(max_window_size=1024 * 1024 * 1024 * 2)
        except Exception:
            # fallback in case that changes
            ctx = ZstdDecompressor(max_window_size=1024 * 1024 * 2)
        f1 = open(fn, "rb", 512 * 1024)
        f2 = ctx.stream_reader(f1)
        objs = [f2, f1]
    else:
        objs = [open(fn, "rb", 512 * 1024)]
    if "b" not in mode:
        enc = kwargs.get("encoding", "utf-8")
        # yield io.TextIOWrapper(io.BufferedReader(f2))
        yield codecs.getreader(enc)(objs[0])
    else:
        yield objs[0]
    for obj in objs:
        obj.close()
def open_compressed(path):
    if is_gzip(path):
        return gzip.open(path)
    if is_zstd(path):
        from zstandard import ZstdDecompressor
        return ZstdDecompressor().stream_reader(open(path, 'rb'))
    return open(path, 'rb')
class ZstdJsonSerializer(Serializer):
    def __init__(self):
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()

    def serialize(self, item) -> bytes:
        return self.compressor.compress(json.dumps(item).encode('utf8'))

    def deserialize(self, serialized_item: bytes):
        return json.loads(
            self.decompressor.decompress(serialized_item).decode('utf8'))
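A minimal round-trip check for the serializer above (the record is made up):

serializer = ZstdJsonSerializer()
blob = serializer.serialize({"id": 1, "name": "example"})
assert isinstance(blob, bytes)
assert serializer.deserialize(blob) == {"id": 1, "name": "example"}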
def __decompressBlock(self, blockID):
    if (blockID >= len(self.CompressedBlockOffsetList)):
        raise EOFError(
            "BlockID exceeds the amounts of compressed blocks in that file!")
    self.nspf.seek(self.CompressedBlockOffsetList[blockID])
    # Blocks that did not shrink under compression are stored raw; only
    # blocks smaller than BlockSize need to be decompressed.
    if self.CompressedBlockSizeList[blockID] < self.BlockSize:
        return ZstdDecompressor().decompress(self.nspf.read(self.BlockSize))
    else:
        return self.nspf.read(self.BlockSize)
def get_submission(contest_id: str, submission_id: str) -> Response:
    params, _ = _validate_request()
    zctx = ZstdDecompressor()
    with transaction() as s:
        u = _validate_token(s)
        contest = s.query(Contest).filter(Contest.id == contest_id).first()
        if not (contest and contest.is_accessible(u)):
            abort(404)
        tmp = s.query(Submission, User.name).filter(
            Submission.contest_id == contest_id,
            Submission.id == submission_id,
            Submission.user_id == User.id).first()
        if not tmp:
            abort(404)
        submission, user_name = tmp
        if not submission.is_accessible(contest, u):
            abort(404)
        ret = submission.to_dict()
        ret['user_name'] = user_name
        ret['tests'] = []
        for t_raw in s.query(JudgeResult).filter(
                JudgeResult.submission_id == submission_id).order_by(
                    JudgeResult.status, JudgeResult.test_id):
            t = t_raw.to_dict()
            # Drop fields the client does not need
            t.pop('contest_id')
            t.pop('problem_id')
            t.pop('submission_id')
            t['id'] = t['test_id']
            t.pop('test_id')
            if not (contest.is_finished() or (u and u['admin'])):
                # While the contest is running, do not return execution
                # time and memory usage to non-admins. (When the value is
                # NULL, to_dict does not set the key, so pass None as the
                # default to pop.)
                t.pop('time', None)
                t.pop('memory', None)
            ret['tests'].append(t)
        ret['code'] = zctx.decompress(ret['code']).decode('utf-8')
        return jsonify(ret)
def _read_rel_zs_rows(filepath, chunk_size=8 * 1000 * 1000):
    from zstandard import ZstdDecompressor
    with open(filepath, "rb") as fh:
        ctx = ZstdDecompressor()
        with ctx.stream_reader(fh) as reader:
            over = False
            chunks = []
            rows = []
            while not over:
                have_row = False
                while not have_row:
                    chunk = reader.read(chunk_size)
                    if not chunk:
                        over = True
                        break
                    if b"\n" in chunk:
                        have_row = True
                    chunks.append(chunk)
                (new_rows, semi_row) = _consume_rows(chunks)
                rows += new_rows
                chunks = [semi_row]
    return rows
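The _consume_rows helper is not shown. Judging from how it is called, it receives the buffered chunks and returns the complete rows plus the trailing partial row; a plausible, purely hypothetical reconstruction:

def _consume_rows(chunks):
    # Hypothetical: join everything buffered so far, split on newlines,
    # and keep the trailing partial row for the next iteration.
    buf = b"".join(chunks)
    *rows, semi_row = buf.split(b"\n")
    return rows, semi_row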
def decompress(data, compressor_id):
    if compressor_id == SnappyContext.compressor_id:
        # python-snappy doesn't support the buffer interface.
        # https://github.com/andrix/python-snappy/issues/65
        # This only matters when data is a memoryview since
        # id(bytes(data)) == id(data) when data is a bytes.
        return snappy.uncompress(bytes(data))
    elif compressor_id == ZlibContext.compressor_id:
        return zlib.decompress(data)
    elif compressor_id == ZstdContext.compressor_id:
        # ZstdDecompressor is not thread safe.
        # TODO: Use a pool?
        return ZstdDecompressor().decompress(data)
    else:
        raise ValueError("Unknown compressorId %d" % (compressor_id,))
def __decompressBlock(self, blockID):
    if self.CurrentBlockId == blockID:
        return self.CurrentBlock
    decompressedBlockSize = self.BlockSize
    if blockID >= len(self.CompressedBlockOffsetList) - 1:
        if blockID >= len(self.CompressedBlockOffsetList):
            raise EOFError(
                "BlockID exceeds the amounts of compressed blocks in that file!")
        # The last block is usually shorter than a full block.
        decompressedBlockSize = self.BlockHeader.decompressedSize % self.BlockSize
    self.nspf.seek(self.CompressedBlockOffsetList[blockID])
    if self.CompressedBlockSizeList[blockID] < decompressedBlockSize:
        self.CurrentBlock = ZstdDecompressor().decompress(
            self.nspf.read(decompressedBlockSize))
    else:
        self.CurrentBlock = self.nspf.read(decompressedBlockSize)
    self.CurrentBlockId = blockID
    return self.CurrentBlock
def decompress(data, compressor_id):
    if compressor_id == SnappyContext.compressor_id:
        # python-snappy doesn't support the buffer interface.
        # https://github.com/andrix/python-snappy/issues/65
        # This only matters when data is a memoryview since
        # id(bytes(data)) == id(data) when data is a bytes.
        # NOTE: bytes(memoryview) returns the memoryview repr
        # in Python 2.7. The right thing to do in 2.7 is call
        # memoryview.tobytes(), but we currently only use
        # memoryview in Python 3.x.
        return snappy.uncompress(bytes(data))
    elif compressor_id == ZlibContext.compressor_id:
        return zlib.decompress(data)
    elif compressor_id == ZstdContext.compressor_id:
        # ZstdDecompressor is not thread safe.
        # TODO: Use a pool?
        return ZstdDecompressor().decompress(data)
    else:
        raise ValueError("Unknown compressorId %d" % (compressor_id,))
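The TODO in both variants above asks about pooling. Since ZstdDecompressor is cheap to construct but not thread safe, one hedged alternative (a sketch, not the library's actual code) is a thread-local instance:

import threading

from zstandard import ZstdDecompressor

_local = threading.local()

def _zstd_decompress(data: bytes) -> bytes:
    # Reuse one decompressor per thread instead of building a new one
    # for every message; instances must never be shared across threads.
    dctx = getattr(_local, "dctx", None)
    if dctx is None:
        dctx = _local.dctx = ZstdDecompressor()
    return dctx.decompress(data)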
def __init__(
    self,
    path: Path,
    *,
    encoding: str,
    warn_uncompressed: bool = True,
    progress_bar: bool = False,
    progress_bar_desc: Optional[str] = None,
):
    self.path = path
    self._fp = path.open("rb")
    self._fin: BinaryIO
    if path.suffix == ".gz":
        self._fin = cast(BinaryIO, GzipFile(fileobj=self._fp))
    elif path.suffix == ".bz2":
        self._fin = cast(BinaryIO, BZ2File(self._fp))
    elif path.suffix == ".xz":
        self._fin = cast(BinaryIO, LZMAFile(self._fp))
    elif path.suffix == ".zst":
        self._fin = cast(BinaryIO, ZstdDecompressor().stream_reader(self._fp))
    else:
        if warn_uncompressed:  # pragma: no cover
            _LOGGER.warning(
                "Could not detect compression type of file '{}' from its "
                "extension, treating as uncompressed file.",
                path,
            )
        self._fin = self._fp

    self._progress_bar: Optional[tqdm[None]] = None
    if progress_bar:
        self._progress_bar = tqdm(
            desc=progress_bar_desc or self.path.name,
            total=self.size(),
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            dynamic_ncols=True,
        )

    super().__init__(self._fin, encoding=encoding)
def __decompressNcz(nspf, f, statusReportInfo, pleaseNoPrint):
    ncaHeaderSize = 0x4000
    blockID = 0
    nspf.seek(0)
    header = nspf.read(ncaHeaderSize)
    if f is not None:
        start = f.tell()
    magic = nspf.read(8)
    if not magic == b'NCZSECTN':
        raise ValueError("No NCZSECTN found! Is this really a .ncz file?")
    sectionCount = nspf.readInt64()
    sections = [Header.Section(nspf) for _ in range(sectionCount)]
    nca_size = ncaHeaderSize
    for i in range(sectionCount):
        nca_size += sections[i].size
    pos = nspf.tell()
    blockMagic = nspf.read(8)
    nspf.seek(pos)
    useBlockCompression = blockMagic == b'NCZBLOCK'
    blockSize = -1
    if useBlockCompression:
        BlockHeader = Header.Block(nspf)
        blockDecompressorReader = BlockDecompressorReader.BlockDecompressorReader(
            nspf, BlockHeader)
    pos = nspf.tell()
    if not useBlockCompression:
        decompressor = ZstdDecompressor().stream_reader(nspf)
    hash = sha256()
    if statusReportInfo is None:
        BAR_FMT = (u'{desc}{desc_pad}{percentage:3.0f}%|{bar}| '
                   u'{count:{len_total}d}/{total:d} {unit} '
                   u'[{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]')
        bar = enlighten.Counter(total=nca_size // 1048576,
                                desc='Decompress',
                                unit="MiB",
                                color='red',
                                bar_format=BAR_FMT)
    decompressedBytes = len(header)
    if f is not None:
        f.write(header)
    if statusReportInfo is not None:
        statusReport, id = statusReportInfo
        statusReport[id] = [len(header), 0, nca_size]
    else:
        bar.count = decompressedBytes // 1048576
        bar.refresh()
    hash.update(header)
    for s in sections:
        i = s.offset
        crypto = aes128.AESCTR(s.cryptoKey, s.cryptoCounter)
        end = s.offset + s.size
        while i < end:
            crypto.seek(i)
            chunkSz = 0x10000 if end - i > 0x10000 else end - i
            if useBlockCompression:
                inputChunk = blockDecompressorReader.read(chunkSz)
            else:
                inputChunk = decompressor.read(chunkSz)
            if not len(inputChunk):
                break
            if not useBlockCompression:
                decompressor.flush()
            if s.cryptoType in (3, 4):
                inputChunk = crypto.encrypt(inputChunk)
            if f is not None:
                f.write(inputChunk)
            hash.update(inputChunk)
            lenInputChunk = len(inputChunk)
            i += lenInputChunk
            decompressedBytes += lenInputChunk
            if statusReportInfo is not None:
                statusReport[id] = [
                    statusReport[id][0] + chunkSz, statusReport[id][1], nca_size
                ]
            else:
                bar.count = decompressedBytes // 1048576
                bar.refresh()
    if statusReportInfo is None:
        bar.close()
    hexHash = hash.hexdigest()
    if f is not None:
        end = f.tell()
        written = (end - start)
        return (written, hexHash)
    return (0, hexHash)
def __init__(self):
    self.compressor = ZstdCompressor()
    self.decompressor = ZstdDecompressor()
def __decompressNcz(nspf, f, statusReportInfo, pleaseNoPrint):
    UNCOMPRESSABLE_HEADER_SIZE = 0x4000
    blockID = 0
    nspf.seek(0)
    header = nspf.read(UNCOMPRESSABLE_HEADER_SIZE)
    if f is not None:
        start = f.tell()
    magic = nspf.read(8)
    if not magic == b'NCZSECTN':
        raise ValueError("No NCZSECTN found! Is this really a .ncz file?")
    sectionCount = nspf.readInt64()
    sections = [Header.Section(nspf) for _ in range(sectionCount)]
    if sections[0].offset - UNCOMPRESSABLE_HEADER_SIZE > 0:
        fakeSection = Header.FakeSection(
            UNCOMPRESSABLE_HEADER_SIZE,
            sections[0].offset - UNCOMPRESSABLE_HEADER_SIZE)
        sections.insert(0, fakeSection)
    nca_size = UNCOMPRESSABLE_HEADER_SIZE
    for i in range(sectionCount):
        nca_size += sections[i].size
    pos = nspf.tell()
    blockMagic = nspf.read(8)
    nspf.seek(pos)
    useBlockCompression = blockMagic == b'NCZBLOCK'
    blockSize = -1
    if useBlockCompression:
        BlockHeader = Header.Block(nspf)
        blockDecompressorReader = BlockDecompressorReader.BlockDecompressorReader(
            nspf, BlockHeader)
    pos = nspf.tell()
    if not useBlockCompression:
        decompressor = ZstdDecompressor().stream_reader(nspf)
    hash = sha256()
    if statusReportInfo is None:
        BAR_FMT = (u'{desc}{desc_pad}{percentage:3.0f}%|{bar}| '
                   u'{count:{len_total}d}/{total:d} {unit} '
                   u'[{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]')
        bar = enlighten.Counter(total=nca_size // 1048576,
                                desc='Decompress',
                                unit="MiB",
                                color='red',
                                bar_format=BAR_FMT)
    decompressedBytes = len(header)
    if f is not None:
        f.write(header)
    if statusReportInfo is not None:
        statusReport, id = statusReportInfo
        statusReport[id] = [len(header), 0, nca_size]
    else:
        bar.count = decompressedBytes // 1048576
        bar.refresh()
    hash.update(header)
    firstSection = True
    for s in sections:
        i = s.offset
        useCrypto = s.cryptoType in (3, 4)
        if useCrypto:
            crypto = aes128.AESCTR(s.cryptoKey, s.cryptoCounter)
        end = s.offset + s.size
        if firstSection:
            firstSection = False
            uncompressedSize = UNCOMPRESSABLE_HEADER_SIZE - sections[0].offset
            if uncompressedSize > 0:
                i += uncompressedSize
        while i < end:
            if useCrypto:
                crypto.seek(i)
            chunkSz = 0x10000 if end - i > 0x10000 else end - i
            if useBlockCompression:
                inputChunk = blockDecompressorReader.read(chunkSz)
            else:
                inputChunk = decompressor.read(chunkSz)
                decompressor.flush()
            if not len(inputChunk):
                break
            if useCrypto:
                inputChunk = crypto.encrypt(inputChunk)
            if f is not None:
                f.write(inputChunk)
            hash.update(inputChunk)
            lenInputChunk = len(inputChunk)
            i += lenInputChunk
            decompressedBytes += lenInputChunk
            if statusReportInfo is not None:
                statusReport[id] = [
                    statusReport[id][0] + chunkSz, statusReport[id][1], nca_size
                ]
            else:
                bar.count = decompressedBytes // 1048576
                bar.refresh()
    if statusReportInfo is None:
        bar.close()
        # Line break after closing the process bar is required to prevent
        # the next output from being on the same line as the process bar
        print()
    hexHash = hash.hexdigest()
    if f is not None:
        end = f.tell()
        written = (end - start)
        return (written, hexHash)
    return (0, hexHash)
def __init__(self, item_type: type, num_pages: int, page_size: int):
    self.item_type = item_type
    self.num_pages = num_pages
    self.page_size = page_size
    self.decompressor = ZstdDecompressor()
    self.mmap = None
def read_index(index_path: Path, rsa_priv_key_path: Path = None) -> dict:
    if index_path is None or not index_path.is_file():
        raise RuntimeError(
            f"Unable to read non-existent index file \"{index_path}\"")

    encryption_flag = None
    compression_flag = None
    session_key = None
    data_size = None
    to_read_buffer = None

    with open(index_path, "rb") as index_stream:
        # Compare raw bytes; str() on a bytes object would yield "b'TINFOIL'"
        # and the check would always fail.
        magic = index_stream.read(7)
        if magic != b"TINFOIL":
            raise RuntimeError(
                "Invalid tinfoil index magic.\n\nExpected Magic = "
                f"\"TINFOIL\"\nMagic in index file = \"{magic}\"")

        flags = index_stream.read(1)[0]
        encryption_flag = flags & 0xF0
        key_available = rsa_priv_key_path is not None and \
            rsa_priv_key_path.is_file()
        if encryption_flag == EncryptionFlag.ENCRYPT and not key_available:
            raise RuntimeError(
                "Unable to decrypt encrypted index without private key.")

        compression_flag = flags & 0x0F
        if compression_flag not in CompressionFlag:
            raise RuntimeError(
                "Unimplemented compression method encountered while reading "
                "index header.")

        session_key = index_stream.read(0x100)
        data_size = int.from_bytes(index_stream.read(8), byteorder="little")
        to_read_buffer = index_stream.read()

    if encryption_flag == EncryptionFlag.ENCRYPT:
        rsa_priv_key = import_rsa_key(open(rsa_priv_key_path).read())
        pkcs1_oaep_ctx = new_pkcs1_oaep_ctx(rsa_priv_key, hashAlgo=SHA256,
                                            label=b"")
        aes_key = pkcs1_oaep_ctx.decrypt(session_key)
        aes_ctx = new_aes_ctx(aes_key, MODE_ECB)
        to_read_buffer = aes_ctx.decrypt(to_read_buffer)

    if compression_flag == CompressionFlag.ZSTD_COMPRESSION:
        to_read_buffer = ZstdDecompressor().decompress(
            to_read_buffer[:data_size])
    elif compression_flag == CompressionFlag.ZLIB_COMPRESSION:
        to_read_buffer = zlib_decompress(to_read_buffer[:data_size])
    elif compression_flag == CompressionFlag.NO_COMPRESSION:
        to_read_buffer = to_read_buffer[:data_size]

    try:
        return json_deserialize(to_read_buffer)
    except JSONDecodeError:
        raise RuntimeError("Unable to deserialize index data.")
blockMagic = f.read(8)
f.seek(pos)
useBlockCompression = blockMagic == b'NCZBLOCK'
if useBlockCompression:
    BlockHeader = Block(f)
    if BlockHeader.blockSizeExponent < 14 or BlockHeader.blockSizeExponent > 32:
        raise ValueError(
            "Corrupted NCZBLOCK header: Block size must be between 14 and 32")
    blockSize = 2**BlockHeader.blockSizeExponent
pos = f.tell()
with open(argv[2], 'wb+') as o:
    o.write(header)
    decompressedBytes = 0
    blockID = 0
    dctx = ZstdDecompressor()
    if not useBlockCompression:
        decompressor = dctx.stream_reader(f)
    while True:
        if useBlockCompression:
            # Each compressed block is its own zstd frame, so a fresh
            # stream reader is created per block.
            if BlockHeader.compressedBlockSizeList[blockID] < blockSize:
                decompressor = dctx.stream_reader(f)
            inputChunk = decompressor.read(blockSize)
            decompressedBytes += len(inputChunk)
            o.write(inputChunk)
            decompressor.flush()
            o.flush()
            print('Block', str(blockID + 1) + '/' +
                  str(BlockHeader.numberOfBlocks))
def __decompressNcz(nspf, f, statusReportInfo):
    UNCOMPRESSABLE_HEADER_SIZE = 0x4000
    blockID = 0
    nspf.seek(0)
    header = nspf.read(UNCOMPRESSABLE_HEADER_SIZE)
    if f is not None:
        start = f.tell()
    magic = nspf.read(8)
    if not magic == b'NCZSECTN':
        raise ValueError("No NCZSECTN found! Is this really a .ncz file?")
    sectionCount = nspf.readInt64()
    sections = [Section(nspf) for _ in range(sectionCount)]
    if sections[0].offset - UNCOMPRESSABLE_HEADER_SIZE > 0:
        fakeSection = FakeSection(
            UNCOMPRESSABLE_HEADER_SIZE,
            sections[0].offset - UNCOMPRESSABLE_HEADER_SIZE)
        sections.insert(0, fakeSection)
    nca_size = UNCOMPRESSABLE_HEADER_SIZE
    for i in range(sectionCount):
        nca_size += sections[i].size
    decompressor = ZstdDecompressor().stream_reader(nspf)
    hash = sha256()
    bar = Status.create(nspf.size, desc=os.path.basename(nspf._path), unit='B')
    # if statusReportInfo == None:
    #     BAR_FMT = u'{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} {unit} [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]'
    #     bar = enlighten.Counter(total=nca_size//1048576, desc='Decompress', unit="MiB", color='red', bar_format=BAR_FMT)
    decompressedBytes = len(header)
    if f is not None:
        f.write(header)
    bar.add(len(header))
    hash.update(header)
    firstSection = True
    for s in sections:
        i = s.offset
        useCrypto = s.cryptoType in (3, 4)
        if useCrypto:
            crypto = aes128.AESCTR(s.cryptoKey, s.cryptoCounter)
        end = s.offset + s.size
        if firstSection:
            firstSection = False
            uncompressedSize = UNCOMPRESSABLE_HEADER_SIZE - sections[0].offset
            if uncompressedSize > 0:
                i += uncompressedSize
        while i < end:
            if useCrypto:
                crypto.seek(i)
            chunkSz = 0x10000 if end - i > 0x10000 else end - i
            inputChunk = decompressor.read(chunkSz)
            decompressor.flush()
            if not len(inputChunk):
                break
            if useCrypto:
                inputChunk = crypto.encrypt(inputChunk)
            if f is not None:
                f.write(inputChunk)
            hash.update(inputChunk)
            lenInputChunk = len(inputChunk)
            i += lenInputChunk
            decompressedBytes += lenInputChunk
            bar.add(lenInputChunk)
    bar.close()
    print()
    hexHash = hash.hexdigest()
    if f is not None:
        end = f.tell()
        written = (end - start)
        return (written, hexHash)
    return (0, hexHash)
        else:
            if not sent:
                continue
            if para_id == '0':
                continue
            out += ' '.join(tokenizer.tokenize(sent)) + '\n'
            sent = ''
    if sent:
        out += ' '.join(tokenizer.tokenize(sent)) + '\n'
    return out


dctx = ZstdDecompressor()
for group in tqdm(os.listdir(source_dir), ncols=80, position=0):
    group_dir = os.path.join(source_dir, group)
    group_outs = []
    for filename in tqdm(os.listdir(group_dir), ncols=80, desc=group,
                         position=1):
        filepath = os.path.join(group_dir, filename)
        with open(filepath, 'rb') as f:
            with dctx.stream_reader(f) as r:
                text_stream = io.TextIOWrapper(r, encoding='utf-8')
                out = convert_file(text_stream)
                group_outs.append(out)
    out = '\n'.join(group_outs)