def decompress_via_writer(data):
    buffer = io.BytesIO()
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_writer(buffer) as decompressor:
        decompressor.write(data)
    return buffer.getvalue()
def extract_tar_zst(path):
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as f:
        with dctx.stream_reader(f) as reader:
            with tarfile.open(mode="r|", fileobj=reader) as tar:
                tar.extractall()
def test_trigger_from_preexisting(
    monkeypatch,
    tmpdir,
    mock_secrets,
    mock_taskcluster,
    mock_phabricator,
    fake_hg_repo,
):
    tmp_path = tmpdir.strpath

    hg, local, remote = fake_hg_repo

    add_file(hg, local, "file", "1\n2\n3\n4\n")
    commit(hg, 1)

    add_file(hg, local, "file", "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
    revision2 = commit(hg, 2)

    hg.push(dest=bytes(remote, "ascii"))

    add_file(hg, local, "file2", "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
    revision3 = commit(hg, 2)

    hg.push(dest=bytes(remote, "ascii"))

    add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
    revision4 = commit(hg, 2)

    hg.push(dest=bytes(remote, "ascii"))

    responses.add(
        responses.HEAD,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.production.cron.latest/artifacts/public/triggered_revisions.zst",
        status=200,
    )

    responses.add(
        responses.GET,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.production.cron.latest/artifacts/public/triggered_revisions.zst",
        status=200,
        body=zstandard.ZstdCompressor().compress(
            f"{revision2}\n{revision3}".encode("ascii")
        ),
    )

    copy_pushlog_database(remote, local)

    myBucket = {}

    def get_bucket(acc):
        return myBucket

    monkeypatch.setattr(trigger_missing, "get_bucket", get_bucket)

    gcp_covdir_exists_calls = 0

    def gcp_covdir_exists(bucket, repository, revision, platform, suite):
        nonlocal gcp_covdir_exists_calls
        gcp_covdir_exists_calls += 1
        assert bucket == myBucket
        assert repository == "mozilla-central"
        assert platform == "all"
        assert suite == "all"
        return revision == revision3

    monkeypatch.setattr(uploader, "gcp_covdir_exists", gcp_covdir_exists)

    def slugId():
        return "myGroupId"

    monkeypatch.setattr(trigger_missing, "slugId", slugId)

    trigger_hook_calls = 0

    def get_service(serv):
        assert serv == "hooks"

        class HooksService:
            def triggerHook(self, hook_group, hook_id, payload):
                nonlocal trigger_hook_calls
                assert hook_group == "project-relman"
                assert hook_id == "code-coverage-repo-production"
                assert payload == {
                    "REPOSITORY": "https://hg.mozilla.org/mozilla-central",
                    "REVISION": revision4,
                    "taskGroupId": "myGroupId",
                    "taskName": f"covdir for {revision4}",
                }
                trigger_hook_calls += 1

        return HooksService()

    monkeypatch.setattr(taskcluster_config, "get_service", get_service)

    get_decision_task_calls = 0

    def get_decision_task(branch, revision):
        nonlocal get_decision_task_calls
        assert branch == "mozilla-central"
        assert revision == revision4
        get_decision_task_calls += 1
        return f"decisionTask-{revision}"

    monkeypatch.setattr(taskcluster, "get_decision_task", get_decision_task)

    get_task_details_calls = 0

    def get_task_details(decision_task_id):
        nonlocal get_task_details_calls
        assert decision_task_id == f"decisionTask-{revision4}"
        get_task_details_calls += 1
        return {"taskGroupId": f"decisionTaskGroup-{revision4}"}

    monkeypatch.setattr(taskcluster, "get_task_details", get_task_details)

    get_tasks_in_group_calls = 0

    def get_tasks_in_group(group_id):
        nonlocal get_tasks_in_group_calls
        assert group_id == f"decisionTaskGroup-{revision4}"
        get_tasks_in_group_calls += 1
        return [
            {
                "status": {
                    "state": "completed",
                },
                "task": {
                    "metadata": {
                        "name": "build-linux64-ccov/opt",
                    }
                },
            }
        ]

    monkeypatch.setattr(taskcluster, "get_tasks_in_group", get_tasks_in_group)

    with hgmo.HGMO(repo_dir=local) as hgmo_server:
        trigger_missing.trigger_missing(hgmo_server.server_address, out_dir=tmp_path)

    assert gcp_covdir_exists_calls == 1
    assert trigger_hook_calls == 1
    assert get_decision_task_calls == 1
    assert get_task_details_calls == 1
    assert get_tasks_in_group_calls == 1

    dctx = zstandard.ZstdDecompressor()
    with open(os.path.join(tmp_path, "triggered_revisions.zst"), "rb") as zf:
        with dctx.stream_reader(zf) as reader:
            with io.TextIOWrapper(reader, encoding="ascii") as f:
                result = set(rev for rev in f.read().splitlines())

    assert result == {revision2, revision3, revision4}
def test_io_api(self):
    buffer = io.BytesIO()
    dctx = zstd.ZstdDecompressor()
    writer = dctx.stream_writer(buffer)

    self.assertFalse(writer.closed)
    self.assertFalse(writer.isatty())
    self.assertFalse(writer.readable())

    with self.assertRaises(io.UnsupportedOperation):
        writer.__iter__()
    with self.assertRaises(io.UnsupportedOperation):
        writer.__next__()
    with self.assertRaises(io.UnsupportedOperation):
        writer.readline()
    with self.assertRaises(io.UnsupportedOperation):
        writer.readline(42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.readline(size=42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.readlines()
    with self.assertRaises(io.UnsupportedOperation):
        writer.readlines(42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.readlines(hint=42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.seek(0)
    with self.assertRaises(io.UnsupportedOperation):
        writer.seek(10, os.SEEK_SET)

    self.assertFalse(writer.seekable())

    with self.assertRaises(io.UnsupportedOperation):
        writer.tell()
    with self.assertRaises(io.UnsupportedOperation):
        writer.truncate()
    with self.assertRaises(io.UnsupportedOperation):
        writer.truncate(42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.truncate(size=42)

    self.assertTrue(writer.writable())

    with self.assertRaises(io.UnsupportedOperation):
        writer.writelines([])
    with self.assertRaises(io.UnsupportedOperation):
        writer.read()
    with self.assertRaises(io.UnsupportedOperation):
        writer.read(42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.read(size=42)
    with self.assertRaises(io.UnsupportedOperation):
        writer.readall()
    with self.assertRaises(io.UnsupportedOperation):
        writer.readinto(None)
    with self.assertRaises(io.UnsupportedOperation):
        writer.fileno()
def main():
    args = get_args()
    file_paths = []
    if os.path.isfile(args.input_path):
        file_paths.append(args.input_path)
    else:
        for root, _, fs in os.walk(args.input_path):
            for f in fs:
                file_paths.append(os.path.join(root, f))

    convert = Converter(args)

    # Check that the tokenizer is available and pick a dtype wide enough for its vocab.
    sample_tokenizer = getattr(tfs, args.tokenizer_name).from_pretrained(args.model_name)
    if sample_tokenizer.vocab_size < 2**16 - 1:
        save_dtype = np.uint16
    else:
        save_dtype = np.int32

    pool = multiprocessing.Pool(args.workers, initializer=convert.initializer)

    # We use BytesIO to store the ids.
    token_ids_stream = io.BytesIO()
    sentlens_stream = io.BytesIO()
    # # Cumsum of token counts
    # sent_cumsum_stream = io.BytesIO()
    # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))

    # Cumsum of sentence counts at each document boundary, type=np.int64
    doc_cumsum_stream = io.BytesIO()
    doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))

    sent_count = 0
    # token_count = 0

    file_paths.sort()

    step = 0
    total_bytes_processed = 0
    startup_start = time.time()
    for file_path in tqdm(file_paths):
        if file_path.endswith(".zst"):
            import zstandard
            dctx = zstandard.ZstdDecompressor()
            fh = open(file_path, 'rb')
            text = io.BufferedReader(dctx.stream_reader(fh))
        elif file_path.endswith(".jsonl"):
            text = open(file_path, 'r', encoding='utf-8')
        else:
            print("Unexpected data format, skipped %s" % file_path)
            continue

        encoded_docs = pool.imap(convert.encode, text, 256)
        print("Processing %s" % file_path)
        for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
            step += 1
            total_bytes_processed += bytes_processed
            if len(doc) == 0:
                continue

            for sentence in doc:
                sentence_len = len(sentence)
                if sentence_len == 0:
                    continue
                sentlens_stream.write(
                    sentence_len.to_bytes(4, byteorder='little', signed=True))
                # token_count += sentence_len
                # sent_cumsum_stream.write(
                #     token_count.to_bytes(8, byteorder='little', signed=True))
                sent_count += 1
                token_ids_stream.write(
                    np.array(sentence, dtype=save_dtype).tobytes(order='C'))

            doc_cumsum_stream.write(
                sent_count.to_bytes(8, byteorder='little', signed=True))

            if step % args.log_interval == 0:
                current = time.time()
                elapsed = current - startup_start
                mbs = total_bytes_processed / elapsed / 1024 / 1024
                print(f"Processed {step} documents",
                      f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).",
                      file=sys.stderr)

    pool.close()
    print("Saving tokens to files...")
    all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype)
    lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32)
    # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64)
    docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64)
    np.save(args.output_prefix + "_ids.npy", all_doc_ids)
    # np.savez(args.output_prefix + "_idx.npz", lens=lens, sents=sents, docs=docs)
    np.savez(args.output_prefix + "_idx.npz", lens=lens, docs=docs)

    print("Total sentences num: %d" % len(lens))
    print("Total documents num: %d" % (len(docs) - 1))
    print("Total tokens num: %d" % len(all_doc_ids))
    print("Average tokens per sentence: %.2f" % (len(all_doc_ids) / len(lens)))
    print("Average tokens per document: %.2f" % (len(all_doc_ids) / (len(docs) - 1)))
def zstd_decompress(data, uncompressed_size):
    dctx = zstandard.ZstdDecompressor()
    return dctx.decompress(data, max_output_size=uncompressed_size)
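# Added usage sketch (not from the original source): zstd frames written by the
# streaming/incremental compressors may omit the decompressed size in the frame
# header, in which case single-shot ZstdDecompressor.decompress() needs an
# explicit upper bound, which is what max_output_size in zstd_decompress()
# above provides. compressobj() below is only used to produce such a frame.
import zstandard

cobj = zstandard.ZstdCompressor().compressobj()
frame = cobj.compress(b"x" * 1000) + cobj.flush()
assert zstd_decompress(frame, 1000) == b"x" * 1000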
def _zstd_open(path: Path):
    import zstandard as zstd  # type: ignore
    fh = path.open('rb')
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    return reader
import os
import itertools
import subprocess

import numpy as np
import fastremap
import zstandard

from ...types.bbox import BBox3d, Vec3d
from ...types.bbox import chunking
from ... import io

DCMP = zstandard.ZstdDecompressor()


def readchunk(cvpath, bbox, startcoord, chunksize, scratchpath,
              voxelres=0, layer=1, bits_per_dim=10, maxmip=11,
              correctvers=True):
    ws = io.read_cloud_volume_chunk(cvpath, bbox, mip=voxelres)
    chunks, chunkinds = reqdchunks(bbox, Vec3d(startcoord), Vec3d(chunksize))
    for (chunk, xyz) in zip(chunks, chunkinds):
        chunk = chunk.translate(-bbox._min)
        remapchunk(ws,
print("Finding file segments...") zsdt_sig_pattern = rb"\x28\xB5\x2F\xFD" pat_regex = re.compile(zsdt_sig_pattern) offsets = [] for match_obj in pat_regex.finditer(data): offset = match_obj.start() offsets.append(offset) subdir = "./" + "uncompressed_" + archive_name if not os.path.exists(subdir): os.mkdir(subdir) dctx = zstandard.ZstdDecompressor() errors = [] for i in range(len(offsets)): start_offset = offsets[i] if (i + 1 > len(offsets) - 1): end_offset = len(data) else: end_offset = offsets[i + 1] print("Decompressing segment", start_offset, "-", end_offset) data_segment = data[start_offset:end_offset] try: decompressed = dctx.decompress(data_segment) except: errors.append([
def try_decompress_at(input_file: bytes, offset: int) -> bytes:
    decoded = None
    try:
        if Signature.check(input_file, offset, Signature.DTB_Appended_Qualcomm):
            # Merely unpack a Qualcomm kernel file containing a magic and DTB
            # offset at the start (so that offsets aren't wrong)
            dtb_offset_le = int.from_bytes(input_file[offset + 16:offset + 20], 'little')
            dtb_offset_be = int.from_bytes(input_file[offset + 16:offset + 20], 'big')
            decoded = input_file[offset + 20:offset + 20 + min(dtb_offset_le, dtb_offset_be)]

        elif Signature.check(input_file, offset, Signature.Android_Bootimg):
            # Unpack an uncompressed Android Bootimg file, version 0, 1, 2 or 3
            # See, for reference:
            # - https://github.com/osm0sis/mkbootimg/blob/master/unpackbootimg.c
            # - https://github.com/osm0sis/mkbootimg/blob/master/bootimg.h
            assert len(input_file) > 4096

            header_version_raw = input_file[offset + 10 * 4:offset + 11 * 4]

            endianness = 'little'

            if header_version_raw in (b'\0\0\0\3', b'\3\0\0\0'):
                page_size = 4096
                if header_version_raw == b'\0\0\0\3':
                    endianness = 'big'
            else:
                page_size_raw = input_file[offset + 9 * 4:offset + 10 * 4]

                page_size_le = int.from_bytes(page_size_raw, 'little')
                page_size_be = int.from_bytes(page_size_raw, 'big')

                if page_size_le < page_size_be:
                    page_size = page_size_le
                else:
                    endianness = 'big'
                    page_size = page_size_be

            kernel_size = int.from_bytes(
                input_file[offset + 2 * 4:offset + 3 * 4], endianness)

            assert len(input_file) > kernel_size > 0x1000
            assert len(input_file) > page_size > 0x200

            decoded = input_file[offset + page_size:offset + page_size + kernel_size]

            # Also try to re-unpack the output image in the case where the nested
            # kernel would start with a "UNCOMPRESSED_IMG" Qualcomm magic, for example
            decoded = try_decompress_at(decoded, 0) or decoded

        elif Signature.check(input_file, offset, Signature.Compressed_GZIP):
            # GZIP - Will stop reading after the GZip footer thanks to our modification above.
            decoded = SingleGzipReader(BytesIO(input_file[offset:])).read(-1)

        elif (Signature.check(input_file, offset, Signature.Compressed_XZ) or
              Signature.check(input_file, offset, Signature.Compressed_LZMA)):
            try:
                # LZMA - Will discard the extra bytes and put it an attribute.
                decoded = LZMADecompressor().decompress(input_file[offset:])
            except Exception:
                # pylzma format compatibility
                decoded = LZMADecompressor().decompress(
                    input_file[offset:offset + 5] + b'\xff' * 8 + input_file[offset + 5:])

        elif Signature.check(input_file, offset, Signature.Compressed_BZ2):
            # BZ2 - Will discard the extra bytes and put it an attribute.
            decoded = BZ2Decompressor().decompress(input_file[offset:])

        elif Signature.check(input_file, offset, Signature.Compressed_LZ4):  # LZ4 support
            try:
                LZ4Decompressor = importlib.import_module('lz4.frame')
            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires LZ4 decompression.')
                logging.error('    But "lz4" python package was not found.')
                logging.error('    Example installation command: "sudo pip3 install lz4"')
                logging.error('')
                return

            context = LZ4Decompressor.create_decompression_context()
            decoded, bytes_read, end_of_frame = LZ4Decompressor.decompress_chunk(
                context, input_file[offset:])

        elif Signature.check(input_file, offset, Signature.Compressed_LZ4_Legacy):
            # LZ4 support (legacy format)
            try:
                from utils.lz4_legacy import decompress_lz4_buffer
            except ImportError:
                try:
                    from vmlinux_to_elf.utils.lz4_legacy import decompress_lz4_buffer
                except ModuleNotFoundError:
                    logging.error('ERROR: This kernel requires LZ4 decompression.')
                    logging.error('    But "lz4" python package was not found.')
                    logging.error('    Example installation command: "sudo pip3 install lz4"')
                    logging.error('')
                    return

            decoded = decompress_lz4_buffer(BytesIO(input_file[offset:]))

        elif Signature.check(input_file, offset, Signature.Compressed_ZSTD):
            try:
                import zstandard as zstd
            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires ZSTD decompression.')
                logging.error('    But "zstandard" python package was not found.')
                logging.error('    Example installation command: "sudo pip3 install zstandard"')
                logging.error('')
                return

            buf = BytesIO()
            context = zstd.ZstdDecompressor()
            for chunk in context.read_to_iter(BytesIO(input_file[offset:])):
                buf.write(chunk)
            buf.seek(0)
            decoded = buf.read()

        elif Signature.check(input_file, offset, Signature.Compressed_LZO):
            try:
                import lzo
            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires LZO decompression.')
                logging.error('    But "python-lzo" python package was not found.')
                logging.error('    Example installation command: "sudo pip3 install '
                              'git+https://github.com/clubby789/python-lzo@b4e39df"')
                logging.error('')
                return

            buf = BytesIO(input_file[offset:])
            decoded = lzo.LzoFile(fileobj=buf, mode='rb').read()

    except Exception:
        pass

    if decoded and len(decoded) > 0x1000:
        logging.info('[+] Kernel successfully decompressed in-memory (the offsets that '
                     'follow will be given relative to the decompressed binary)')
        return decoded
def zstd_decompress(body):
    d = zstd.ZstdDecompressor()
    return d.decompress(body)
def DecodeBuffer(_buffer, _offset, _outbuffer):
    if _offset >= len(_buffer):
        return -1

    # if _offset + 1 + 4 + 1 + 1 > len(_buffer): return -1

    ret = IsGoodLogBuffer(_buffer, _offset, 1)
    if not ret[0]:
        fixpos = GetLogStartPos(_buffer[_offset:], 1)
        if -1 == fixpos:
            return -1
        else:
            _outbuffer.extend("[F]decode_log_file.py decode error len=%d, result:%s \n" % (fixpos, ret[1]))
            _offset += fixpos

    magic_start = _buffer[_offset]
    if MAGIC_NO_COMPRESS_START == magic_start or MAGIC_COMPRESS_START == magic_start \
            or MAGIC_COMPRESS_START1 == magic_start:
        crypt_key_len = 4
    elif MAGIC_COMPRESS_START2 == magic_start or MAGIC_NO_COMPRESS_START1 == magic_start \
            or MAGIC_NO_COMPRESS_NO_CRYPT_START == magic_start or MAGIC_COMPRESS_NO_CRYPT_START == magic_start \
            or MAGIC_SYNC_ZSTD_START == magic_start or MAGIC_SYNC_NO_CRYPT_ZSTD_START == magic_start \
            or MAGIC_ASYNC_ZSTD_START == magic_start or MAGIC_ASYNC_NO_CRYPT_ZSTD_START == magic_start:
        crypt_key_len = 64
    else:
        _outbuffer.extend('in DecodeBuffer _buffer[%d]:%d != MAGIC_NUM_START' % (_offset, magic_start))
        return -1

    headerLen = 1 + 2 + 1 + 1 + 4 + crypt_key_len
    length = struct.unpack_from("I", buffer(_buffer, _offset + headerLen - 4 - crypt_key_len, 4))[0]
    tmpbuffer = bytearray(length)

    seq = struct.unpack_from("H", buffer(_buffer, _offset + headerLen - 4 - crypt_key_len - 2 - 2, 2))[0]
    begin_hour = struct.unpack_from("c", buffer(_buffer, _offset + headerLen - 4 - crypt_key_len - 1 - 1, 1))[0]
    end_hour = struct.unpack_from("c", buffer(_buffer, _offset + headerLen - 4 - crypt_key_len - 1, 1))[0]

    global lastseq
    if seq != 0 and seq != 1 and lastseq != 0 and seq != (lastseq + 1):
        _outbuffer.extend("[F]decode_log_file.py log seq:%d-%d is missing\n" % (lastseq + 1, seq - 1))

    if seq != 0:
        lastseq = seq

    tmpbuffer[:] = _buffer[_offset + headerLen:_offset + headerLen + length]

    try:
        if MAGIC_NO_COMPRESS_START1 == _buffer[_offset] or MAGIC_COMPRESS_START2 == _buffer[_offset] \
                or MAGIC_SYNC_ZSTD_START == _buffer[_offset] or MAGIC_ASYNC_ZSTD_START == _buffer[_offset]:
            print("use wrong decode script")
        elif MAGIC_ASYNC_NO_CRYPT_ZSTD_START == _buffer[_offset]:
            decompressor = zstd.ZstdDecompressor()
            tmpbuffer = next(decompressor.read_from(ZstdDecompressReader(str(tmpbuffer)), 100000, 1000000))
        elif MAGIC_COMPRESS_START == _buffer[_offset] or MAGIC_COMPRESS_NO_CRYPT_START == _buffer[_offset]:
            decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
            tmpbuffer = decompressor.decompress(str(tmpbuffer))
        elif MAGIC_COMPRESS_START1 == _buffer[_offset]:
            decompress_data = bytearray()
            while len(tmpbuffer) > 0:
                single_log_len = struct.unpack_from("H", buffer(tmpbuffer, 0, 2))[0]
                decompress_data.extend(tmpbuffer[2:single_log_len + 2])
                tmpbuffer[:] = tmpbuffer[single_log_len + 2:len(tmpbuffer)]
            decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
            tmpbuffer = decompressor.decompress(str(decompress_data))
        else:
            pass
            # _outbuffer.extend('seq:%d, hour:%d-%d len:%d decompress:%d\n'
            #                   % (seq, ord(begin_hour), ord(end_hour), length, len(tmpbuffer)))
    except Exception as e:
        traceback.print_exc()
        _outbuffer.extend("[F]decode_log_file.py decompress err, " + str(e) + "\n")
        return _offset + headerLen + length + 1

    _outbuffer.extend(tmpbuffer)

    return _offset + headerLen + length + 1
def download_and_process(file_url, mode, subreddit_names, st_time):
    # download and pre-process original posts
    f_name = pjoin('reddit_tmp', file_url.split('/')[-1])
    tries_left = 4
    while tries_left:
        try:
            print("downloading %s %2f" % (f_name, time() - st_time))
            subprocess.run(['wget', '-P', 'reddit_tmp', file_url], stdout=subprocess.PIPE)
            print("decompressing and filtering %s %2f" % (f_name, time() - st_time))
            if f_name.split('.')[-1] == 'xz':
                f = lzma.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'bz2':
                f = bz2.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'zst':
                fh = open(f_name, 'rb')
                dctx = zstd.ZstdDecompressor()
                stream_reader = dctx.stream_reader(fh)
                f = io.TextIOWrapper(stream_reader, encoding='utf-8')
            lines = dict([(name, []) for name in subreddit_names])
            for i, l in enumerate(f):
                if i % 1000000 == 0:
                    print("read %d lines, found %d" % (i, sum([len(ls) for ls in lines.values()])),
                          time() - st_time)
                for name in subreddit_names:
                    if name in l:
                        lines[name] += [l.strip()]
            if f_name.split('.')[-1] == 'zst':
                fh.close()
            else:
                f.close()
            os.remove(f_name)
            tries_left = 0
        except EOFError as e:
            sleep(10)
            print("failed reading file %s file, another %d tries" % (f_name, tries_left))
            os.remove(f_name)
            tries_left -= 1
    print("tokenizing and selecting %s %2f" % (f_name, time() - st_time))
    processed_items = dict([(name, []) for name in subreddit_names])
    if mode == 'submissions':
        key_list = ['id', 'score', 'url', 'title', 'selftext']
    else:
        key_list = ['id', 'link_id', 'parent_id', 'score', 'body']
    for name in subreddit_names:
        for line in lines[name]:
            reddit_dct = json.loads(line)
            if reddit_dct.get('num_comments', 1) > 0 and reddit_dct.get('score', 0) and \
                    reddit_dct.get('score', 0) >= 2 and \
                    (mode == 'submissions' or valid_comment(reddit_dct)):
                reddit_res = {}
                for k in key_list:
                    if k in ['title', 'selftext', 'body']:
                        if reddit_dct[k].lower() in ['[removed]', '[deleted]']:
                            reddit_dct[k] = ''
                        txt, url_list = word_url_tokenize(reddit_dct[k])
                        reddit_res[k] = (' '.join(txt.split()), url_list)
                    else:
                        reddit_res[k] = reddit_dct[k]
                processed_items[name] += [reddit_res]
    print("Total found %d" % (len(processed_items)), time() - st_time)
    return processed_items
def test_memory_size(self):
    dctx = zstd.ZstdDecompressor()
    self.assertGreater(dctx.memory_size(), 100)
def __decompress(filePath, outputDir=None, write=True, raiseVerificationException=False):
    ncaHeaderSize = 0x4000
    CHUNK_SZ = 0x100000

    if write:
        if outputDir is None:
            nspPath = filePath[0:-1] + 'p'
        else:
            nspPath = os.path.join(outputDir, os.path.basename(filePath[0:-1] + 'p'))
        nspPath = os.path.abspath(nspPath)

        Print.info('decompressing %s -> %s' % (filePath, nspPath))

        newNsp = Fs.Pfs0.Pfs0Stream(nspPath)

    filePath = os.path.abspath(filePath)
    container = Fs.factory(filePath)
    container.open(filePath, 'rb')

    for nspf in container:
        if isinstance(nspf, Fs.Nca.Nca) and nspf.header.contentType == Fs.Type.Content.DATA:
            Print.info('skipping delta fragment')
            continue

        if not nspf._path.endswith('.ncz'):
            verifyFile = nspf._path.endswith('.nca') and not nspf._path.endswith('.cnmt.nca')
            if write:
                f = newNsp.add(nspf._path, nspf.size)
            hash = hashlib.sha256()

            nspf.seek(0)
            while not nspf.eof():
                inputChunk = nspf.read(CHUNK_SZ)
                hash.update(inputChunk)
                if write:
                    f.write(inputChunk)

            hexHash = hash.hexdigest()[0:32]
            if verifyFile:
                if hexHash + '.nca' == nspf._path:
                    Print.error('[VERIFIED]   {0}'.format(nspf._path))
                else:
                    Print.info('[CORRUPTED]  {0}'.format(nspf._path))
                    if raiseVerificationException:
                        raise Exception("Verification detected hash mismatch!")
            elif not write:
                Print.info('[EXISTS]  {0}'.format(nspf._path))
            continue

        newFileName = nspf._path[0:-1] + 'a'
        if write:
            f = newNsp.add(newFileName, nspf.size)
            start = f.tell()

        blockID = 0
        nspf.seek(0)

        header = nspf.read(ncaHeaderSize)
        magic = nspf.read(8)
        if not magic == b'NCZSECTN':
            raise ValueError("No NCZSECTN found! Is this really a .ncz file?")

        sectionCount = nspf.readInt64()
        sections = []
        for i in range(sectionCount):
            sections.append(Header.Section(nspf))

        pos = nspf.tell()
        blockMagic = nspf.read(8)
        nspf.seek(pos)
        useBlockCompression = blockMagic == b'NCZBLOCK'

        blockSize = -1
        if useBlockCompression:
            BlockHeader = Header.Block(nspf)
            blockDecompressorReader = BlockDecompressorReader.BlockDecompressorReader(nspf, BlockHeader)
        pos = nspf.tell()

        dctx = zstandard.ZstdDecompressor()
        if not useBlockCompression:
            decompressor = dctx.stream_reader(nspf)

        hash = hashlib.sha256()

        with tqdm(total=nspf.size, unit_scale=True, unit="B/s") as bar:
            if write:
                f.write(header)
            bar.update(len(header))
            hash.update(header)

            for s in sections:
                i = s.offset
                crypto = aes128.AESCTR(s.cryptoKey, s.cryptoCounter)
                end = s.offset + s.size
                while i < end:
                    crypto.seek(i)
                    chunkSz = 0x10000 if end - i > 0x10000 else end - i
                    if useBlockCompression:
                        inputChunk = blockDecompressorReader.read(chunkSz)
                    else:
                        inputChunk = decompressor.read(chunkSz)
                    if not len(inputChunk):
                        break
                    if not useBlockCompression:
                        decompressor.flush()
                    if s.cryptoType in (3, 4):
                        inputChunk = crypto.encrypt(inputChunk)
                    if write:
                        f.write(inputChunk)
                    bar.update(len(inputChunk))
                    hash.update(inputChunk)
                    i += len(inputChunk)

        hexHash = hash.hexdigest()[0:32]
        if hexHash + '.nca' == newFileName:
            Print.error('[VERIFIED]   {0}'.format(nspf._path))
        else:
            Print.info('[CORRUPTED]  {0}'.format(nspf._path))
            if raiseVerificationException:
                raise Exception("Verification detected hash mismatch")

        if write:
            end = f.tell()
            written = (end - start)
            newNsp.resize(newFileName, written)
        continue

    if write:
        newNsp.close()
def load_single_pbc(name):
    arc = open(romfs_path + '/Model/' + name + '_pbc.Nin_NX_NVN.zs', 'rb').read()
    arc = zstandard.ZstdDecompressor().decompress(arc)
    arc = sarc.SARC(arc)
    return pbc.PBC(arc.get_file_data(name + '.pbc'))
messages_path = sys.argv[1]
bcsv_path = sys.argv[2]


def fixup(name):
    if name.startswith('\x0e2'):
        name = name[6:]
    name = name.replace('\x0en\x1e\0', '<name>')
    return name


output = {}

# MESSAGES
data = open(messages_path + '/String_EUen.sarc.zs', 'rb').read()
data = zstandard.ZstdDecompressor().decompress(data)
msgArc = sarc.SARC(data)

output['items'] = {}
outfitGroup = {}

for name in sorted(msgArc.list_files()):
    if 'STR_ItemName' in name:
        m = msbt.MSBT()
        m.load(msgArc.get_file_data(name))
        for label, index in m.labels.items():
            if not label.endswith('_pl'):
                item_id = int(label[label.rfind('_') + 1:], 10)
                output['items'][item_id] = fixup(m.strings[index])
    if 'STR_OutfitGroupColor' in name:
        m = msbt.MSBT()
def import_pbc_arc(filename):
    arc = open(romfs_path + '/Model/' + filename, 'rb').read()
    arc = zstandard.ZstdDecompressor().decompress(arc)
    arc = sarc.SARC(arc)
    for name in arc.list_files():
        tile_pbcs[name] = pbc.PBC(arc.get_file_data(name))
def download_and_modify_image():
    # This function downloads and edits the downloaded tar file on the fly.
    # It emits chunked buffers of the edited tar file, as a generator.
    print("Downloading from {}".format(url))
    # get_session() gets us a requests.Session set to retry several times.
    req = get_session().get(url, stream=True)
    req.raise_for_status()

    with zstd.ZstdDecompressor().stream_reader(req.raw) as ifh:
        tarin = tarfile.open(
            mode='r|',
            fileobj=ifh,
            bufsize=zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)

        # Stream through each member of the downloaded tar file individually.
        for member in tarin:
            # Non-file members only need a tar header. Emit one.
            if not member.isfile():
                yield member.tobuf(tarfile.GNU_FORMAT)
                continue

            # Open stream reader for the member
            reader = tarin.extractfile(member)

            # If member is `repositories`, we parse and possibly rewrite the
            # image tags.
            if member.name == 'repositories':
                # Read and parse repositories
                repos = json.loads(reader.read())
                reader.close()

                # If there is more than one image or tag, we can't handle it
                # here.
                if len(repos.keys()) > 1:
                    raise Exception('file contains more than one image')
                info['image'] = image = repos.keys()[0]
                if len(repos[image].keys()) > 1:
                    raise Exception('file contains more than one tag')
                info['tag'] = tag = repos[image].keys()[0]
                info['layer'] = layer = repos[image][tag]

                # Rewrite the repositories file
                data = json.dumps({imageName or image: {imageTag or tag: layer}})
                reader = BytesIO(data)
                member.size = len(data)

            # Emit the tar header for this member.
            yield member.tobuf(tarfile.GNU_FORMAT)
            # Then emit its content.
            remaining = member.size
            while remaining:
                length = min(remaining, zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)
                buf = reader.read(length)
                remaining -= len(buf)
                yield buf
            # Pad to fill a 512 bytes block, per tar format.
            remainder = member.size % 512
            if remainder:
                yield '\0' * (512 - remainder)

            reader.close()
def trigger_missing(server_address: str, out_dir: str = ".") -> None:
    triggered_revisions_path = os.path.join(out_dir, "triggered_revisions.zst")

    url = f"https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.{secrets[secrets.APP_CHANNEL]}.cron.latest/artifacts/public/triggered_revisions.zst"  # noqa
    r = requests.head(url, allow_redirects=True)
    if r.status_code != 404:
        utils.download_file(url, triggered_revisions_path)

    try:
        dctx = zstandard.ZstdDecompressor()
        with open(triggered_revisions_path, "rb") as zf:
            with dctx.stream_reader(zf) as reader:
                with io.TextIOWrapper(reader, encoding="ascii") as f:
                    triggered_revisions = set(rev for rev in f.read().splitlines())
    except FileNotFoundError:
        triggered_revisions = set()

    # Get all mozilla-central revisions from the past year.
    days = 365 if secrets[secrets.APP_CHANNEL] == "production" else 30
    a_year_ago = datetime.utcnow() - timedelta(days=days)
    with hgmo.HGMO(server_address=server_address) as hgmo_server:
        data = hgmo_server.get_pushes(
            startDate=a_year_ago.strftime("%Y-%m-%d"), full=False, tipsonly=True
        )

    revisions = [
        (push_data["changesets"][0], int(push_data["date"]))
        for push_data in data["pushes"].values()
    ]

    logger.info(f"{len(revisions)} pushes in the past year")

    assert (
        secrets[secrets.GOOGLE_CLOUD_STORAGE] is not None
    ), "Missing GOOGLE_CLOUD_STORAGE secret"
    bucket = get_bucket(secrets[secrets.GOOGLE_CLOUD_STORAGE])

    missing_revisions = []
    for revision, timestamp in revisions:
        # Skip revisions that have already been triggered. If they are still missing,
        # it means there is a problem that is preventing us from ingesting them.
        if revision in triggered_revisions:
            continue

        # If the revision was already ingested, we don't need to trigger ingestion for it again.
        if uploader.gcp_covdir_exists(bucket, "mozilla-central", revision, "all", "all"):
            triggered_revisions.add(revision)
            continue

        missing_revisions.append((revision, timestamp))

    logger.info(f"{len(missing_revisions)} missing pushes in the past year")

    yesterday = int(datetime.timestamp(datetime.utcnow() - timedelta(days=1)))

    task_group_id = slugId()
    logger.info(f"Triggering tasks in the {task_group_id} group")
    triggered = 0
    for revision, timestamp in reversed(missing_revisions):
        # If it's older than yesterday, we assume the group finished.
        # If it is newer than yesterday, we load the group and check if all tasks in it finished.
        if timestamp > yesterday:
            decision_task_id = taskcluster.get_decision_task("mozilla-central", revision)
            if decision_task_id is None:
                continue

            group = taskcluster.get_task_details(decision_task_id)["taskGroupId"]
            if not all(
                task["status"]["state"] in taskcluster.FINISHED_STATUSES
                for task in taskcluster.get_tasks_in_group(group)
                if taskcluster.is_coverage_task(task["task"])
            ):
                continue

        trigger_task(task_group_id, revision)
        triggered_revisions.add(revision)
        triggered += 1
        if triggered == MAXIMUM_TRIGGERS:
            break

    cctx = zstandard.ZstdCompressor(threads=-1)
    with open(triggered_revisions_path, "wb") as zf:
        with cctx.stream_writer(zf) as compressor:
            with io.TextIOWrapper(compressor, encoding="ascii") as f:
                f.write("\n".join(triggered_revisions))
def test_fileno_file(self):
    with tempfile.TemporaryFile("wb") as tf:
        dctx = zstd.ZstdDecompressor()
        writer = dctx.stream_writer(tf)

        self.assertEqual(writer.fileno(), tf.fileno())
def _decode_base64_data(
        data_text: str,
        layer_width: int,
        compression: Optional[str] = None) -> objects.TileLayerGrid:
    """Decode base64 data.

    Args:
        data_text: Data to be decoded.
        layer_width: Width of each layer in tiles.
        compression: The type of compression for the data.

    Raises:
        ValueError: If compression type is unsupported.

    Returns:
        objects.TileLayerGrid: Tile grid.
    """
    tile_grid: objects.TileLayerGrid = [[]]

    unencoded_data = base64.b64decode(data_text)
    if compression == "zlib":
        unzipped_data = zlib.decompress(unencoded_data)
    elif compression == "gzip":
        unzipped_data = gzip.decompress(unencoded_data)
    elif compression == "zstd":
        modulename = 'zstandard'
        my_loader = importlib.find_loader(modulename)
        found = my_loader is not None
        if not found:
            raise ValueError(
                "Can't load 'zstd' compressed map without the 'zstandard' "
                "library available. Either install 'zstandard' or go to "
                "Map Properties and change Tile Layer Format to "
                "Base64, Base64 gzip, or Base64.")
        else:
            import zstandard
            dctx = zstandard.ZstdDecompressor()
            unzipped_data = dctx.decompress(unencoded_data)
    elif compression is None:
        unzipped_data = unencoded_data
    else:
        raise ValueError(f"Unsupported compression type: '{compression}'.")

    # Turn bytes into 4-byte integers
    byte_count = 0
    int_count = 0
    int_value = 0
    row_count = 0
    for byte in unzipped_data:
        int_value += byte << (byte_count * 8)
        byte_count += 1
        if not byte_count % 4:
            byte_count = 0
            int_count += 1
            tile_grid[row_count].append(int_value)
            int_value = 0
            if not int_count % layer_width:
                row_count += 1
                tile_grid.append([])

    tile_grid.pop()
    return tile_grid
def polydata_list_from_json(js, manager=None):
    """Deserialize a Javascript vtk.js PolyData object.

    Decompresses data buffers.
    """
    if js is None:
        return None
    else:
        decompressor = zstd.ZstdDecompressor()

        polydata_list = []
        for json_polydata in js:
            polydata = dict()
            for top_key, top_value in json_polydata.items():
                if isinstance(top_value, dict):
                    nested_value_copy = dict()
                    for nested_key, nested_value in top_value.items():
                        if not nested_key == 'compressedValues':
                            nested_value_copy[nested_key] = nested_value
                    polydata[top_key] = nested_value_copy
                else:
                    polydata[top_key] = top_value

            if 'points' in polydata:
                dtype = _type_to_numpy(polydata['points']['dataType'])
                if six.PY2:
                    asBytes = json_polydata['points']['compressedValues'].tobytes()
                    valuesBufferArrayCompressed = np.frombuffer(asBytes, dtype=np.uint8)
                else:
                    valuesBufferArrayCompressed = np.frombuffer(
                        json_polydata['points']['compressedValues'], dtype=np.uint8)
                numberOfBytes = json_polydata['points']['size'] * np.dtype(dtype).itemsize
                valuesBufferArray = np.frombuffer(
                    decompressor.decompress(valuesBufferArrayCompressed, numberOfBytes),
                    dtype=dtype)
                valuesBufferArray.shape = (int(json_polydata['points']['size'] / 3), 3)
                polydata['points']['values'] = valuesBufferArray

            for cell_type in ['verts', 'lines', 'polys', 'strips']:
                if cell_type in polydata:
                    dtype = _type_to_numpy(polydata[cell_type]['dataType'])
                    if six.PY2:
                        asBytes = json_polydata[cell_type]['compressedValues'].tobytes()
                        valuesBufferArrayCompressed = np.frombuffer(asBytes, dtype=np.uint8)
                    else:
                        valuesBufferArrayCompressed = np.frombuffer(
                            json_polydata[cell_type]['compressedValues'], dtype=np.uint8)
                    numberOfBytes = json_polydata[cell_type]['size'] * np.dtype(dtype).itemsize
                    valuesBufferArray = np.frombuffer(
                        decompressor.decompress(valuesBufferArrayCompressed, numberOfBytes),
                        dtype=dtype)
                    valuesBufferArray.shape = (json_polydata[cell_type]['size'],)
                    polydata[cell_type]['values'] = valuesBufferArray

            for data_type in ['pointData', 'cellData']:
                if data_type in polydata:
                    data = json_polydata[data_type]
                    decompressed_data = dict()
                    for nested_key, nested_value in data.items():
                        if not nested_key == 'arrays':
                            decompressed_data[nested_key] = nested_value
                    decompressed_arrays = []
                    for array in json_polydata[data_type]['arrays']:
                        decompressed_array = dict()
                        for nested_key, nested_value in array['data'].items():
                            if not nested_key == 'compressedValues':
                                decompressed_array[nested_key] = nested_value
                        dtype = _type_to_numpy(decompressed_array['dataType'])
                        if six.PY2:
                            asBytes = array['data']['compressedValues'].tobytes()
                            valuesBufferArrayCompressed = np.frombuffer(asBytes, dtype=np.uint8)
                        else:
                            valuesBufferArrayCompressed = np.frombuffer(
                                array['data']['compressedValues'], dtype=np.uint8)
                        numberOfBytes = decompressed_array['size'] * np.dtype(dtype).itemsize
                        valuesBufferArray = np.frombuffer(
                            decompressor.decompress(valuesBufferArrayCompressed, numberOfBytes),
                            dtype=dtype)
                        valuesBufferArray.shape = (decompressed_array['size'],)
                        decompressed_array['values'] = valuesBufferArray
                        decompressed_arrays.append({'data': decompressed_array})
                    decompressed_data['arrays'] = decompressed_arrays
                    polydata[data_type] = decompressed_data

            polydata_list.append(polydata)
        return polydata_list
def zstd_decompress(content):
    ctx = zstd.ZstdDecompressor()
    return ctx.decompress(content)
def zstd_decompress(path):
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as input_f:
        with open(path, "wb") as output_f:
            dctx.copy_stream(input_f, output_f)
def download_manifest(
        manifest_path,  # type: Text
        tags_func,  # type: Callable[[], List[Text]]
        url_func,  # type: Callable[[List[Text]], Optional[List[Text]]]
        force=False  # type: bool
):
    # type: (...) -> bool
    if not force and not should_download(manifest_path):
        return False

    tags = tags_func()

    urls = url_func(tags)
    if not urls:
        logger.warning("No generated manifest found")
        return False

    for url in urls:
        logger.info("Downloading manifest from %s" % url)
        try:
            resp = urlopen(url)
        except Exception:
            logger.warning("Downloading pregenerated manifest failed")
            continue

        if resp.code != 200:
            logger.warning("Downloading pregenerated manifest failed; got HTTP status %d" %
                           resp.code)
            continue

        if url.endswith(".zst"):
            if not zstandard:
                continue
            try:
                dctx = zstandard.ZstdDecompressor()
                decompressed = dctx.decompress(resp.read())
            except IOError:
                logger.warning("Failed to decompress downloaded file")
                continue
        elif url.endswith(".bz2"):
            try:
                decompressed = bz2.decompress(resp.read())
            except IOError:
                logger.warning("Failed to decompress downloaded file")
                continue
        elif url.endswith(".gz"):
            fileobj = io.BytesIO(resp.read())
            try:
                with gzip.GzipFile(fileobj=fileobj) as gzf:
                    data = read_gzf(gzf)  # type: ignore
                decompressed = data
            except IOError:
                logger.warning("Failed to decompress downloaded file")
                continue
        else:
            logger.warning("Unknown file extension: %s" % url)
            continue
        break
    else:
        return False

    try:
        with open(manifest_path, "wb") as f:
            f.write(decompressed)
    except Exception:
        logger.warning("Failed to write manifest")
        return False
    logger.info("Manifest downloaded")
    return True
flags = input[7] & 0x0F input = input[8:] aesKey = unwrapKey(input[0:256], args.privatekeys) input = input[256:] len = int.from_bytes(input[0:8], byteorder='little') cipher = AES.new(aesKey, AES.MODE_ECB) input = cipher.decrypt(input[8:]) if flags == 0x0E: #zlib input = zlib.decompress(input[0:len]) elif flags == 0x0D: #zstd dctx = zstd.ZstdDecompressor() input = dctx.decompress(input[0:len]) elif flags == 0x00: #plaintext input = input[0:len] else: raise IOError('invalid flag') with open(args.output, 'wb') as f: f.write(input) else: if args.zlib: flag = 0x0E print('compressing with zlib') buf = zlib.compress(input, 9) elif args.zstd: flag = 0x0D
def __init__(self):
    dctx = zstd.ZstdDecompressor()
    self._decomp = dctx.decompressobj()
    self.eof = False
def test_bad_write_size(self):
    dctx = zstd.ZstdDecompressor()

    with self.assertRaisesRegex(ValueError, "write_size must be positive"):
        dctx.decompressobj(write_size=0)
def test_simple(self):
    data = zstd.ZstdCompressor(level=1).compress(b'foobar')

    dctx = zstd.ZstdDecompressor()
    dobj = dctx.decompressobj()

    self.assertEqual(dobj.decompress(data), b'foobar')
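# Added note (not part of the test suite above): decompressobj() mirrors the
# incremental zlib/bz2 APIs, so a frame can also be fed in arbitrary chunks.
# A small sketch, assuming `zstd` is the imported `zstandard` module:
def _chunked_roundtrip_example():
    data = zstd.ZstdCompressor(level=1).compress(b"foobar" * 1000)
    dobj = zstd.ZstdDecompressor().decompressobj()
    out = b"".join(dobj.decompress(data[i:i + 64]) for i in range(0, len(data), 64))
    assert out == b"foobar" * 1000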