Example #1
def decompress_via_writer(data):
    buffer = io.BytesIO()
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_writer(buffer) as decompressor:
        decompressor.write(data)
    return buffer.getvalue()
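A note on this pattern: the listing omits the imports it relies on (`io`, plus `zstandard` under the `zstd` alias). A minimal round-trip sketch under those assumptions, showing that the writer pushes decompressed bytes into the wrapped BytesIO:

import io
import zstandard as zstd

# Produce a frame with the one-shot compressor, then feed it to a
# decompressing stream_writer(); the plain bytes land in `buffer`.
frame = zstd.ZstdCompressor().compress(b"hello zstd")

buffer = io.BytesIO()
dctx = zstd.ZstdDecompressor()
decompressor = dctx.stream_writer(buffer)
decompressor.write(frame)

assert buffer.getvalue() == b"hello zstd"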
Example #2
def extract_tar_zst(path):
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as f:
        with dctx.stream_reader(f) as reader:
            with tarfile.open(mode="r|", fileobj=reader) as tar:
                tar.extractall()
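For symmetry, here is a sketch of the writing side: a hypothetical create_tar_zst helper (not part of the example's project) that streams a tar archive through a compression writer and produces archives extract_tar_zst can unpack:

import tarfile
import zstandard

def create_tar_zst(path, source_dir):
    # Stream a tar archive through a zstd compression writer into "<path>.zst".
    cctx = zstandard.ZstdCompressor()
    with open(f"{path}.zst", "wb") as f:
        with cctx.stream_writer(f) as compressor:
            with tarfile.open(mode="w|", fileobj=compressor) as tar:
                tar.add(source_dir, arcname=".")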
Example #3
def test_trigger_from_preexisting(monkeypatch, tmpdir, mock_secrets,
                                  mock_taskcluster, mock_phabricator,
                                  fake_hg_repo):
    tmp_path = tmpdir.strpath

    hg, local, remote = fake_hg_repo

    add_file(hg, local, "file", "1\n2\n3\n4\n")
    commit(hg, 1)

    add_file(hg, local, "file", "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
    revision2 = commit(hg, 2)

    hg.push(dest=bytes(remote, "ascii"))

    add_file(hg, local, "file2", "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
    revision3 = commit(hg, 2)

    hg.push(dest=bytes(remote, "ascii"))

    add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
    revision4 = commit(hg, 2)

    hg.push(dest=bytes(remote, "ascii"))

    responses.add(
        responses.HEAD,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.production.cron.latest/artifacts/public/triggered_revisions.zst",
        status=200,
    )

    responses.add(
        responses.GET,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.production.cron.latest/artifacts/public/triggered_revisions.zst",
        status=200,
        body=zstandard.ZstdCompressor().compress(
            f"{revision2}\n{revision3}".encode("ascii")),
    )

    copy_pushlog_database(remote, local)

    myBucket = {}

    def get_bucket(acc):
        return myBucket

    monkeypatch.setattr(trigger_missing, "get_bucket", get_bucket)

    gcp_covdir_exists_calls = 0

    def gcp_covdir_exists(bucket, repository, revision, platform, suite):
        nonlocal gcp_covdir_exists_calls
        gcp_covdir_exists_calls += 1
        assert bucket == myBucket
        assert repository == "mozilla-central"
        assert platform == "all"
        assert suite == "all"
        return revision == revision3

    monkeypatch.setattr(uploader, "gcp_covdir_exists", gcp_covdir_exists)

    def slugId():
        return "myGroupId"

    monkeypatch.setattr(trigger_missing, "slugId", slugId)

    trigger_hook_calls = 0

    def get_service(serv):
        assert serv == "hooks"

        class HooksService:
            def triggerHook(self, hook_group, hook_id, payload):
                nonlocal trigger_hook_calls
                assert hook_group == "project-relman"
                assert hook_id == "code-coverage-repo-production"
                assert payload == {
                    "REPOSITORY": "https://hg.mozilla.org/mozilla-central",
                    "REVISION": revision4,
                    "taskGroupId": "myGroupId",
                    "taskName": f"covdir for {revision4}",
                }
                trigger_hook_calls += 1

        return HooksService()

    monkeypatch.setattr(taskcluster_config, "get_service", get_service)

    get_decision_task_calls = 0

    def get_decision_task(branch, revision):
        nonlocal get_decision_task_calls
        assert branch == "mozilla-central"
        assert revision == revision4
        get_decision_task_calls += 1
        return f"decisionTask-{revision}"

    monkeypatch.setattr(taskcluster, "get_decision_task", get_decision_task)

    get_task_details_calls = 0

    def get_task_details(decision_task_id):
        nonlocal get_task_details_calls
        assert decision_task_id == f"decisionTask-{revision4}"
        get_task_details_calls += 1
        return {"taskGroupId": f"decisionTaskGroup-{revision4}"}

    monkeypatch.setattr(taskcluster, "get_task_details", get_task_details)

    get_tasks_in_group_calls = 0

    def get_tasks_in_group(group_id):
        nonlocal get_tasks_in_group_calls
        assert group_id == f"decisionTaskGroup-{revision4}"
        get_tasks_in_group_calls += 1
        return [{
            "status": {
                "state": "completed",
            },
            "task": {
                "metadata": {
                    "name": "build-linux64-ccov/opt",
                }
            },
        }]

    monkeypatch.setattr(taskcluster, "get_tasks_in_group", get_tasks_in_group)

    with hgmo.HGMO(repo_dir=local) as hgmo_server:
        trigger_missing.trigger_missing(hgmo_server.server_address,
                                        out_dir=tmp_path)

    assert gcp_covdir_exists_calls == 1
    assert trigger_hook_calls == 1
    assert get_decision_task_calls == 1
    assert get_task_details_calls == 1
    assert get_tasks_in_group_calls == 1

    dctx = zstandard.ZstdDecompressor()
    with open(os.path.join(tmp_path, "triggered_revisions.zst"), "rb") as zf:
        with dctx.stream_reader(zf) as reader:
            with io.TextIOWrapper(reader, encoding="ascii") as f:
                result = set(rev for rev in f.read().splitlines())

    assert result == {revision2, revision3, revision4}
Example #4
    def test_io_api(self):
        buffer = io.BytesIO()
        dctx = zstd.ZstdDecompressor()
        writer = dctx.stream_writer(buffer)

        self.assertFalse(writer.closed)
        self.assertFalse(writer.isatty())
        self.assertFalse(writer.readable())

        with self.assertRaises(io.UnsupportedOperation):
            writer.__iter__()

        with self.assertRaises(io.UnsupportedOperation):
            writer.__next__()

        with self.assertRaises(io.UnsupportedOperation):
            writer.readline()

        with self.assertRaises(io.UnsupportedOperation):
            writer.readline(42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.readline(size=42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.readlines()

        with self.assertRaises(io.UnsupportedOperation):
            writer.readlines(42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.readlines(hint=42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.seek(0)

        with self.assertRaises(io.UnsupportedOperation):
            writer.seek(10, os.SEEK_SET)

        self.assertFalse(writer.seekable())

        with self.assertRaises(io.UnsupportedOperation):
            writer.tell()

        with self.assertRaises(io.UnsupportedOperation):
            writer.truncate()

        with self.assertRaises(io.UnsupportedOperation):
            writer.truncate(42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.truncate(size=42)

        self.assertTrue(writer.writable())

        with self.assertRaises(io.UnsupportedOperation):
            writer.writelines([])

        with self.assertRaises(io.UnsupportedOperation):
            writer.read()

        with self.assertRaises(io.UnsupportedOperation):
            writer.read(42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.read(size=42)

        with self.assertRaises(io.UnsupportedOperation):
            writer.readall()

        with self.assertRaises(io.UnsupportedOperation):
            writer.readinto(None)

        with self.assertRaises(io.UnsupportedOperation):
            writer.fileno()
Example #5
def main():
    args = get_args()

    file_paths = []
    if os.path.isfile(args.input_path):
        file_paths.append(args.input_path)
    else:
        for root, _, fs in os.walk(args.input_path):
            for f in fs:
                file_paths.append(os.path.join(root, f))
    convert = Converter(args)

    # Check that the requested tokenizer is available
    sample_tokenizer = getattr(tfs, args.tokenizer_name).from_pretrained(
        args.model_name)
    if sample_tokenizer.vocab_size < 2**16 - 1:
        save_dtype = np.uint16
    else:
        save_dtype = np.int32

    pool = multiprocessing.Pool(args.workers, initializer=convert.initializer)

    # We use BytesIO to store the ids.
    token_ids_stream = io.BytesIO()
    sentlens_stream = io.BytesIO()
    # # Cumsum on tokens num
    # sent_cumsum_stream = io.BytesIO()
    # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))
    # Cumsum of per-document sentence counts, type=np.int64
    doc_cumsum_stream = io.BytesIO()
    doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))

    sent_count = 0
    # token_count = 0

    file_paths.sort()

    step = 0
    total_bytes_processed = 0
    startup_start = time.time()
    for file_path in tqdm(file_paths):
        if file_path.endswith(".zst"):
            import zstandard
            dctx = zstandard.ZstdDecompressor()
            fh = open(file_path, 'rb')
            text = io.BufferedReader(dctx.stream_reader(fh))
        elif file_path.endswith(".jsonl"):
            text = open(file_path, 'r', encoding='utf-8')
        else:
            print("Unexpected data format, skiped %s" % file_path)
            continue

        encoded_docs = pool.imap(convert.encode, text, 256)
        print("Processing %s" % file_path)
        for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
            step += 1
            total_bytes_processed += bytes_processed
            if len(doc) == 0:
                continue

            for sentence in doc:
                sentence_len = len(sentence)
                if sentence_len == 0:
                    continue
                sentlens_stream.write(
                    sentence_len.to_bytes(4, byteorder='little', signed=True))
                # token_count += sentence_len
                # sent_cumsum_stream.write(
                #     token_count.to_bytes(
                #         8, byteorder='little', signed=True))
                sent_count += 1
                token_ids_stream.write(
                    np.array(sentence, dtype=save_dtype).tobytes(order='C'))

            doc_cumsum_stream.write(
                sent_count.to_bytes(8, byteorder='little', signed=True))

            if step % args.log_interval == 0:
                current = time.time()
                elapsed = current - startup_start
                mbs = total_bytes_processed / elapsed / 1024 / 1024
                print(f"Processed {step} documents",
                      f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).",
                      file=sys.stderr)

    pool.close()
    print("Saving tokens to files...")
    all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype)
    lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32)
    # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64)
    docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64)
    np.save(args.output_prefix + "_ids.npy", all_doc_ids)
    # np.savez(args.output_prefix + "_idx.npz", lens=lens, sents=sents, docs=docs)
    np.savez(args.output_prefix + "_idx.npz", lens=lens, docs=docs)

    print("Total sentences num: %d" % len(lens))
    print("Total documents num: %d" % (len(docs) - 1))
    print("Total tokens num: %d" % len(all_doc_ids))
    print("Average tokens per sentence: %.2f" % (len(all_doc_ids) / len(lens)))
    print("Average tokens per document: %.2f" % (len(all_doc_ids) /
                                                 (len(docs) - 1)))
Example #6
 def zstd_decompress(data, uncompressed_size):
     dctx = zstandard.ZstdDecompressor()
     return dctx.decompress(data, max_output_size=uncompressed_size)
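The max_output_size argument matters because a one-shot decompress() needs to size its output buffer, and frames written by the streaming/incremental APIs usually omit the content size from the frame header. A minimal sketch, assuming the caller knows an upper bound (1000 bytes here):

import zstandard

# compressobj() does not know the total input size up front, so the
# resulting frame header carries no content size.
cobj = zstandard.ZstdCompressor().compressobj()
frame = cobj.compress(b"x" * 1000) + cobj.flush()

dctx = zstandard.ZstdDecompressor()
# Without max_output_size this would raise ZstdError, because the frame
# does not say how large the output is; the upper bound sizes the buffer.
data = dctx.decompress(frame, max_output_size=1000)
assert data == b"x" * 1000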
Example #7
def _zstd_open(path: Path):
    import zstandard as zstd  # type: ignore
    fh = path.open('rb')
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    return reader
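The object returned here behaves like a binary file, so callers typically wrap it for line-oriented text access, as several later examples in this listing also do. A hypothetical usage sketch (the path and encoding are illustrative assumptions):

import io
from pathlib import Path

reader = _zstd_open(Path("events.jsonl.zst"))  # assumed example file
with io.TextIOWrapper(reader, encoding="utf-8") as text:
    for line in text:
        pass  # process each decompressed line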
Example #8
import os
import itertools
import subprocess

import numpy as np
import fastremap
import zstandard

from ...types.bbox import BBox3d, Vec3d
from ...types.bbox import chunking
from ... import io

DCMP = zstandard.ZstdDecompressor()


def readchunk(cvpath,
              bbox,
              startcoord,
              chunksize,
              scratchpath,
              voxelres=0,
              layer=1,
              bits_per_dim=10,
              maxmip=11,
              correctvers=True):
    ws = io.read_cloud_volume_chunk(cvpath, bbox, mip=voxelres)

    chunks, chunkinds = reqdchunks(bbox, Vec3d(startcoord), Vec3d(chunksize))
    for (chunk, xyz) in zip(chunks, chunkinds):
        chunk = chunk.translate(-bbox._min)
        remapchunk(ws,
print("Finding file segments...")
zstd_sig_pattern = rb"\x28\xB5\x2F\xFD"  # zstd frame magic number
pat_regex = re.compile(zstd_sig_pattern)

offsets = []
for match_obj in pat_regex.finditer(data):
    offset = match_obj.start()
    offsets.append(offset)

subdir = "./" + "uncompressed_" + archive_name

if not os.path.exists(subdir):
    os.mkdir(subdir)

dctx = zstandard.ZstdDecompressor()
errors = []
for i in range(len(offsets)):
    start_offset = offsets[i]

    if (i + 1 > len(offsets) - 1):
        end_offset = len(data)
    else:
        end_offset = offsets[i + 1]

    print("Decompressing segment", start_offset, "-", end_offset)
    data_segment = data[start_offset:end_offset]
    try:
        decompressed = dctx.decompress(data_segment)
    except Exception:
        errors.append([
Example #10
def try_decompress_at(input_file: bytes, offset: int) -> bytes:

    decoded = None
    try:

        if Signature.check(
                input_file, offset, Signature.DTB_Appended_Qualcomm
        ):  # Merely unpack a Qualcomm kernel file containing a magic and DTB offset at the start (so that offsets aren't wrong)

            dtb_offset_le = int.from_bytes(input_file[offset + 16:offset + 20],
                                           'little')
            dtb_offset_be = int.from_bytes(input_file[offset + 16:offset + 20],
                                           'big')

            decoded = input_file[offset + 20:offset + 20 +
                                 min(dtb_offset_le, dtb_offset_be)]

        elif Signature.check(
                input_file, offset, Signature.Android_Bootimg
        ):  # Unpack an uncompressed Android Bootimg file, version 0, 1, 2 or 3

            # See, for reference:
            # - https://github.com/osm0sis/mkbootimg/blob/master/unpackbootimg.c
            # - https://github.com/osm0sis/mkbootimg/blob/master/bootimg.h

            assert len(input_file) > 4096

            header_version_raw = input_file[offset + 10 * 4:offset + 11 * 4]

            endianness = 'little'

            if header_version_raw in (b'\0\0\0\3', b'\3\0\0\0'):
                page_size = 4096

                if header_version_raw == b'\0\0\0\3':
                    endianness = 'big'

            else:
                page_size_raw = input_file[offset + 9 * 4:offset + 10 * 4]

                page_size_le = int.from_bytes(page_size_raw, 'little')
                page_size_be = int.from_bytes(page_size_raw, 'big')

                if page_size_le < page_size_be:
                    page_size = page_size_le
                else:
                    endianness = 'big'
                    page_size = page_size_be

            kernel_size = int.from_bytes(
                input_file[offset + 2 * 4:offset + 3 * 4], endianness)

            assert len(input_file) > kernel_size > 0x1000
            assert len(input_file) > page_size > 0x200

            decoded = input_file[offset + page_size:offset + page_size +
                                 kernel_size]

            # Also try to re-unpack the output image in the case where the nested
            # kernel would start with a "UNCOMPRESSED_IMG" Qualcomm magic, for example

            decoded = try_decompress_at(decoded, 0) or decoded

        elif Signature.check(input_file, offset, Signature.Compressed_GZIP):
            decoded = SingleGzipReader(BytesIO(input_file[offset:])).read(
                -1
            )  # GZIP - Will stop reading after the GZip footer thanks to our modification above.

        elif (Signature.check(input_file, offset, Signature.Compressed_XZ) or
              Signature.check(input_file, offset, Signature.Compressed_LZMA)):
            try:
                decoded = LZMADecompressor().decompress(
                    input_file[offset:]
                )  # LZMA - Will discard the extra bytes and put it an attribute.

            except Exception:
                decoded = LZMADecompressor().decompress(
                    input_file[offset:offset + 5] + b'\xff' * 8 +
                    input_file[offset + 5:])  # pylzma format compatibility

        elif Signature.check(input_file, offset, Signature.Compressed_BZ2):
            decoded = BZ2Decompressor().decompress(
                input_file[offset:]
            )  # BZ2 - Will discard the extra bytes and put it an attribute.

        elif Signature.check(input_file, offset,
                             Signature.Compressed_LZ4):  # LZ4 support
            try:
                LZ4Decompressor = importlib.import_module('lz4.frame')

            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires LZ4 decompression.')
                logging.error('       But "lz4" python package was not found.')
                logging.error(
                    '       Example installation command: "sudo pip3 install lz4"'
                )
                logging.error('')
                return

            context = LZ4Decompressor.create_decompression_context()
            decoded, bytes_read, end_of_frame = LZ4Decompressor.decompress_chunk(
                context, input_file[offset:])

        elif Signature.check(input_file, offset,
                             Signature.Compressed_LZ4_Legacy
                             ):  # LZ4 support (legacy format)

            try:
                from utils.lz4_legacy import decompress_lz4_buffer
            except ImportError:
                try:
                    from vmlinux_to_elf.utils.lz4_legacy import decompress_lz4_buffer
                except ModuleNotFoundError:
                    logging.error(
                        'ERROR: This kernel requires LZ4 decompression.')
                    logging.error(
                        '       But "lz4" python package was not found.')
                    logging.error(
                        '       Example installation command: "sudo pip3 install lz4"'
                    )
                    logging.error('')
                    return

            decoded = decompress_lz4_buffer(BytesIO(input_file[offset:]))

        elif Signature.check(input_file, offset, Signature.Compressed_ZSTD):
            try:
                import zstandard as zstd
            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires ZSTD decompression.')
                logging.error(
                    '       But "zstandard" python package was not found.')
                logging.error(
                    '       Example installation command: "sudo pip3 install zstandard"'
                )
                logging.error('')
                return
            buf = BytesIO()
            context = zstd.ZstdDecompressor()
            for chunk in context.read_to_iter(BytesIO(input_file[offset:])):
                buf.write(chunk)
            buf.seek(0)
            decoded = buf.read()

        elif Signature.check(input_file, offset, Signature.Compressed_LZO):
            try:
                import lzo
            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires LZO decompression.')
                logging.error(
                    '       But "python-lzo" python package was not found.')
                logging.error(
                    '       Example installation command: "sudo pip3 install git+https://github.com/clubby789/python-lzo@b4e39df"'
                )
                logging.error('')
                return
            buf = BytesIO(input_file[offset:])
            decoded = lzo.LzoFile(fileobj=buf, mode='rb').read()
    except Exception:
        pass

    if decoded and len(decoded) > 0x1000:
        logging.info((
            '[+] Kernel successfully decompressed in-memory (the offsets that '
            + 'follow will be given relative to the decompressed binary)'))

        return decoded
Example #11
 def zstd_decompress(body):
     d = zstd.ZstdDecompressor()
     return d.decompress(body)
Example #12
def DecodeBuffer(_buffer, _offset, _outbuffer):
    
    if _offset >= len(_buffer): return -1
    # if _offset + 1 + 4 + 1 + 1 > len(_buffer): return -1
    ret = IsGoodLogBuffer(_buffer, _offset, 1)
    if not ret[0]:
        fixpos = GetLogStartPos(_buffer[_offset:], 1)
        if -1==fixpos: 
            return -1
        else:
            _outbuffer.extend("[F]decode_log_file.py decode error len=%d, result:%s \n"%(fixpos, ret[1]))
            _offset += fixpos 

    magic_start = _buffer[_offset]
    if MAGIC_NO_COMPRESS_START==magic_start or MAGIC_COMPRESS_START==magic_start or MAGIC_COMPRESS_START1==magic_start:
        crypt_key_len = 4
    elif MAGIC_COMPRESS_START2==magic_start or MAGIC_NO_COMPRESS_START1==magic_start or MAGIC_NO_COMPRESS_NO_CRYPT_START==magic_start or MAGIC_COMPRESS_NO_CRYPT_START==magic_start\
            or MAGIC_SYNC_ZSTD_START==magic_start or MAGIC_SYNC_NO_CRYPT_ZSTD_START==magic_start or MAGIC_ASYNC_ZSTD_START==magic_start or MAGIC_ASYNC_NO_CRYPT_ZSTD_START==magic_start:
        crypt_key_len = 64
    else:
        _outbuffer.extend('in DecodeBuffer _buffer[%d]:%d != MAGIC_NUM_START'%(_offset, magic_start))
        return -1

    headerLen = 1 + 2 + 1 + 1 + 4 + crypt_key_len
    length = struct.unpack_from("I", buffer(_buffer, _offset+headerLen-4-crypt_key_len, 4))[0]
    tmpbuffer = bytearray(length)

    seq=struct.unpack_from("H", buffer(_buffer, _offset+headerLen-4-crypt_key_len-2-2, 2))[0]
    begin_hour=struct.unpack_from("c", buffer(_buffer, _offset+headerLen-4-crypt_key_len-1-1, 1))[0]
    end_hour=struct.unpack_from("c", buffer(_buffer, _offset+headerLen-4-crypt_key_len-1, 1))[0]

    global lastseq
    if seq != 0 and seq != 1 and lastseq != 0 and seq != (lastseq+1):
        _outbuffer.extend("[F]decode_log_file.py log seq:%d-%d is missing\n" %(lastseq+1, seq-1))

    if seq != 0:
        lastseq = seq

    tmpbuffer[:] = _buffer[_offset+headerLen:_offset+headerLen+length]

    try:

        if MAGIC_NO_COMPRESS_START1 == _buffer[_offset] or MAGIC_COMPRESS_START2 == _buffer[
            _offset] or MAGIC_SYNC_ZSTD_START == _buffer[_offset] or MAGIC_ASYNC_ZSTD_START == _buffer[_offset]:
            print("use wrong decode script")
        elif MAGIC_ASYNC_NO_CRYPT_ZSTD_START == _buffer[_offset]:
            decompressor = zstd.ZstdDecompressor()
            tmpbuffer = next(decompressor.read_from(ZstdDecompressReader(str(tmpbuffer)), 100000, 1000000))
        elif MAGIC_COMPRESS_START == _buffer[_offset] or MAGIC_COMPRESS_NO_CRYPT_START == _buffer[_offset]:
            decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
            tmpbuffer = decompressor.decompress(str(tmpbuffer))
        elif MAGIC_COMPRESS_START1 == _buffer[_offset]:
            decompress_data = bytearray()
            while len(tmpbuffer) > 0:
                single_log_len = struct.unpack_from("H", buffer(tmpbuffer, 0, 2))[0]
                decompress_data.extend(tmpbuffer[2:single_log_len + 2])
                tmpbuffer[:] = tmpbuffer[single_log_len + 2:len(tmpbuffer)]

            decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
            tmpbuffer = decompressor.decompress(str(decompress_data))

        else:
            pass

            # _outbuffer.extend('seq:%d, hour:%d-%d len:%d decompress:%d\n' %(seq, ord(begin_hour), ord(end_hour), length, len(tmpbuffer)))
    except Exception as e:
        traceback.print_exc()
        _outbuffer.extend("[F]decode_log_file.py decompress err, " + str(e) + "\n")
        return _offset + headerLen + length + 1

    _outbuffer.extend(tmpbuffer)
    
    return _offset+headerLen+length+1
Example #13
def download_and_process(file_url, mode, subreddit_names, st_time):
    # download and pre-process original posts
    f_name = pjoin('reddit_tmp', file_url.split('/')[-1])
    tries_left = 4
    while tries_left:
        try:
            print("downloading %s %2f" % (f_name, time() - st_time))
            subprocess.run(['wget', '-P', 'reddit_tmp', file_url],
                           stdout=subprocess.PIPE)
            print("decompressing and filtering %s %2f" %
                  (f_name, time() - st_time))
            if f_name.split('.')[-1] == 'xz':
                f = lzma.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'bz2':
                f = bz2.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'zst':
                fh = open(f_name, 'rb')
                dctx = zstd.ZstdDecompressor()
                stream_reader = dctx.stream_reader(fh)
                f = io.TextIOWrapper(stream_reader, encoding='utf-8')
            lines = dict([(name, []) for name in subreddit_names])
            for i, l in enumerate(f):
                if i % 1000000 == 0:
                    print(
                        "read %d lines, found %d" %
                        (i, sum([len(ls) for ls in lines.values()])),
                        time() - st_time)
                for name in subreddit_names:
                    if name in l:
                        lines[name] += [l.strip()]
            if f_name.split('.')[-1] == 'zst':
                fh.close()
            else:
                f.close()
            os.remove(f_name)
            tries_left = 0
        except EOFError as e:
            sleep(10)
            print("failed reading file %s file, another %d tries" %
                  (f_name, tries_left))
            os.remove(f_name)
            tries_left -= 1
    print("tokenizing and selecting %s %2f" % (f_name, time() - st_time))
    processed_items = dict([(name, []) for name in subreddit_names])
    if mode == 'submissions':
        key_list = ['id', 'score', 'url', 'title', 'selftext']
    else:
        key_list = ['id', 'link_id', 'parent_id', 'score', 'body']
    for name in subreddit_names:
        for line in lines[name]:
            reddit_dct = json.loads(line)
            if reddit_dct.get('num_comments', 1) > 0 and reddit_dct.get(
                    'score', 0) and reddit_dct.get('score', 0) >= 2 and (
                        mode == 'submissions' or valid_comment(reddit_dct)):
                reddit_res = {}
                for k in key_list:
                    if k in ['title', 'selftext', 'body']:
                        if reddit_dct[k].lower() in ['[removed]', '[deleted]']:
                            reddit_dct[k] = ''
                        txt, url_list = word_url_tokenize(reddit_dct[k])
                        reddit_res[k] = (' '.join(txt.split()), url_list)
                    else:
                        reddit_res[k] = reddit_dct[k]
                processed_items[name] += [reddit_res]
    print("Total found %d" % (len(processed_items)), time() - st_time)
    return processed_items
Example #14
    def test_memory_size(self):
        dctx = zstd.ZstdDecompressor()

        self.assertGreater(dctx.memory_size(), 100)
Example #15
def __decompress(filePath, outputDir = None, write = True, raiseVerificationException = False):
	
	ncaHeaderSize = 0x4000
	CHUNK_SZ = 0x100000
	
	if write:
		if outputDir is None:
			nspPath = filePath[0:-1] + 'p'
		else:
			nspPath = os.path.join(outputDir, os.path.basename(filePath[0:-1] + 'p'))
			
		nspPath = os.path.abspath(nspPath)
		
		Print.info('decompressing %s -> %s' % (filePath, nspPath))
		
		newNsp = Fs.Pfs0.Pfs0Stream(nspPath)
	
	filePath = os.path.abspath(filePath)
	container = Fs.factory(filePath)
	
	container.open(filePath, 'rb')
	
	
	for nspf in container:
		if isinstance(nspf, Fs.Nca.Nca) and nspf.header.contentType == Fs.Type.Content.DATA:
			Print.info('skipping delta fragment')
			continue

		if not nspf._path.endswith('.ncz'):
			verifyFile = nspf._path.endswith('.nca') and not nspf._path.endswith('.cnmt.nca')
			if write:
				f = newNsp.add(nspf._path, nspf.size)
			hash = hashlib.sha256()
			nspf.seek(0)
			while not nspf.eof():
				inputChunk = nspf.read(CHUNK_SZ)
				hash.update(inputChunk)
				if write:
					f.write(inputChunk)
			hexHash = hash.hexdigest()[0:32]
			if verifyFile:
				if hexHash + '.nca' == nspf._path:
					Print.error('[VERIFIED]   {0}'.format(nspf._path))
				else:
					Print.info('[CORRUPTED]  {0}'.format(nspf._path))
					if raiseVerificationException:
						raise Exception("Verification detected hash missmatch!")
			elif not write:
				Print.info('[EXISTS]     {0}'.format(nspf._path))
			continue

		newFileName = nspf._path[0:-1] + 'a'
		if write:
			f = newNsp.add(newFileName, nspf.size)
			start = f.tell()
		blockID = 0
		nspf.seek(0)
		
		header = nspf.read(ncaHeaderSize)
		magic = nspf.read(8)
		if not magic == b'NCZSECTN':
			raise ValueError("No NCZSECTN found! Is this really a .ncz file?")
		sectionCount = nspf.readInt64()
		sections = []
		for i in range(sectionCount):
			sections.append(Header.Section(nspf))

		pos = nspf.tell()
		blockMagic = nspf.read(8)
		nspf.seek(pos)
		useBlockCompression = blockMagic == b'NCZBLOCK'
		
		blockSize = -1
		if useBlockCompression:
			BlockHeader = Header.Block(nspf)
			blockDecompressorReader = BlockDecompressorReader.BlockDecompressorReader(nspf, BlockHeader)
		pos = nspf.tell()

		dctx = zstandard.ZstdDecompressor()
		if not useBlockCompression:
			decompressor = dctx.stream_reader(nspf)

		hash = hashlib.sha256()
		with tqdm(total=nspf.size, unit_scale=True, unit="B/s") as bar:
			if write:
				f.write(header)
			bar.update(len(header))
			hash.update(header)
			
			for s in sections:
				i = s.offset
				
				crypto = aes128.AESCTR(s.cryptoKey, s.cryptoCounter)
				end = s.offset + s.size
				
				while i < end:
					crypto.seek(i)
					chunkSz = 0x10000 if end - i > 0x10000 else end - i
					if useBlockCompression:
						inputChunk = blockDecompressorReader.read(chunkSz)
					else:
						inputChunk = decompressor.read(chunkSz)
					
					if not len(inputChunk):
						break
					
					if not useBlockCompression:
						decompressor.flush()
					if s.cryptoType in (3, 4):
						inputChunk = crypto.encrypt(inputChunk)
					if write:
						f.write(inputChunk)
					bar.update(len(inputChunk))
					hash.update(inputChunk)
					
					i += len(inputChunk)

		hexHash = hash.hexdigest()[0:32]
		if hexHash + '.nca' == newFileName:
			Print.error('[VERIFIED]   {0}'.format(nspf._path))
		else:
			Print.info('[CORRUPTED]  {0}'.format(nspf._path))
			if raiseVerificationException:
				raise Exception("Verification detected hash missmatch")

		
		if write:
			end = f.tell()
			written = (end - start)
			newNsp.resize(newFileName, written)
		
		continue

	if write:
		newNsp.close()
Example #16
def load_single_pbc(name):
    arc = open(romfs_path + '/Model/' + name + '_pbc.Nin_NX_NVN.zs',
               'rb').read()
    arc = zstandard.ZstdDecompressor().decompress(arc)
    arc = sarc.SARC(arc)
    return pbc.PBC(arc.get_file_data(name + '.pbc'))
Example #17
messages_path = sys.argv[1]
bcsv_path = sys.argv[2]

def fixup(name):
	if name.startswith('\x0e2'):
		name = name[6:]
	name = name.replace('\x0en\x1e\0', '<name>')
	return name

output = {}


# MESSAGES
data = open(messages_path + '/String_EUen.sarc.zs', 'rb').read()
data = zstandard.ZstdDecompressor().decompress(data)
msgArc = sarc.SARC(data)

output['items'] = {}
outfitGroup = {}

for name in sorted(msgArc.list_files()):
	if 'STR_ItemName' in name:
		m = msbt.MSBT()
		m.load(msgArc.get_file_data(name))
		for label, index in m.labels.items():
			if not label.endswith('_pl'):
				item_id = int(label[label.rfind('_') + 1:], 10)
				output['items'][item_id] = fixup(m.strings[index])
	if 'STR_OutfitGroupColor' in name:
		m = msbt.MSBT()
Example #18
def import_pbc_arc(filename):
    arc = open(romfs_path + '/Model/' + filename, 'rb').read()
    arc = zstandard.ZstdDecompressor().decompress(arc)
    arc = sarc.SARC(arc)
    for name in arc.list_files():
        tile_pbcs[name] = pbc.PBC(arc.get_file_data(name))
Example #19
    def download_and_modify_image():
        # This function downloads and edits the downloaded tar file on the fly.
        # It emits chunked buffers of the edited tar file, as a generator.
        print("Downloading from {}".format(url))
        # get_session() gets us a requests.Session set to retry several times.
        req = get_session().get(url, stream=True)
        req.raise_for_status()

        with zstd.ZstdDecompressor().stream_reader(req.raw) as ifh:

            tarin = tarfile.open(
                mode='r|',
                fileobj=ifh,
                bufsize=zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)

            # Stream through each member of the downloaded tar file individually.
            for member in tarin:
                # Non-file members only need a tar header. Emit one.
                if not member.isfile():
                    yield member.tobuf(tarfile.GNU_FORMAT)
                    continue

                # Open stream reader for the member
                reader = tarin.extractfile(member)

                # If member is `repositories`, we parse and possibly rewrite the
                # image tags.
                if member.name == 'repositories':
                    # Read and parse repositories
                    repos = json.loads(reader.read())
                    reader.close()

                    # If there is more than one image or tag, we can't handle it
                    # here.
                    if len(repos.keys()) > 1:
                        raise Exception('file contains more than one image')
                    info['image'] = image = repos.keys()[0]
                    if len(repos[image].keys()) > 1:
                        raise Exception('file contains more than one tag')
                    info['tag'] = tag = repos[image].keys()[0]
                    info['layer'] = layer = repos[image][tag]

                    # Rewrite the repositories file
                    data = json.dumps(
                        {imageName or image: {
                            imageTag or tag: layer
                        }})
                    reader = BytesIO(data)
                    member.size = len(data)

                # Emit the tar header for this member.
                yield member.tobuf(tarfile.GNU_FORMAT)
                # Then emit its content.
                remaining = member.size
                while remaining:
                    length = min(remaining,
                                 zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)
                    buf = reader.read(length)
                    remaining -= len(buf)
                    yield buf
                # Pad to fill a 512 bytes block, per tar format.
                remainder = member.size % 512
                if remainder:
                    yield '\0' * (512 - remainder)

                reader.close()
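The interesting part of this example is that stream_reader() accepts any object with a read() method, including a streaming HTTP response. A stripped-down sketch of just that idea (the helper name, the `zstd` alias, and the use of requests are assumptions carried over from the example):

import io
import requests
import zstandard as zstd

def iter_remote_zst_lines(url):
    # Hypothetical helper: stream a .zst resource over HTTP and yield
    # decompressed text lines without holding the whole file in memory.
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(resp.raw) as reader:
            for line in io.TextIOWrapper(reader, encoding="utf-8"):
                yield line.rstrip("\n")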
Example #20
def trigger_missing(server_address: str, out_dir: str = ".") -> None:
    triggered_revisions_path = os.path.join(out_dir, "triggered_revisions.zst")

    url = f"https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.{secrets[secrets.APP_CHANNEL]}.cron.latest/artifacts/public/triggered_revisions.zst"  # noqa
    r = requests.head(url, allow_redirects=True)
    if r.status_code != 404:
        utils.download_file(url, triggered_revisions_path)

    try:
        dctx = zstandard.ZstdDecompressor()
        with open(triggered_revisions_path, "rb") as zf:
            with dctx.stream_reader(zf) as reader:
                with io.TextIOWrapper(reader, encoding="ascii") as f:
                    triggered_revisions = set(rev
                                              for rev in f.read().splitlines())
    except FileNotFoundError:
        triggered_revisions = set()

    # Get all mozilla-central revisions from the past year.
    days = 365 if secrets[secrets.APP_CHANNEL] == "production" else 30
    a_year_ago = datetime.utcnow() - timedelta(days=days)
    with hgmo.HGMO(server_address=server_address) as hgmo_server:
        data = hgmo_server.get_pushes(
            startDate=a_year_ago.strftime("%Y-%m-%d"),
            full=False,
            tipsonly=True)

    revisions = [(push_data["changesets"][0], int(push_data["date"]))
                 for push_data in data["pushes"].values()]

    logger.info(f"{len(revisions)} pushes in the past year")

    assert (secrets[secrets.GOOGLE_CLOUD_STORAGE]
            is not None), "Missing GOOGLE_CLOUD_STORAGE secret"
    bucket = get_bucket(secrets[secrets.GOOGLE_CLOUD_STORAGE])

    missing_revisions = []
    for revision, timestamp in revisions:
        # Skip revisions that have already been triggered. If they are still missing,
        # it means there is a problem that is preventing us from ingesting them.
        if revision in triggered_revisions:
            continue

        # If the revision was already ingested, we don't need to trigger ingestion for it again.
        if uploader.gcp_covdir_exists(bucket, "mozilla-central", revision,
                                      "all", "all"):
            triggered_revisions.add(revision)
            continue

        missing_revisions.append((revision, timestamp))

    logger.info(f"{len(missing_revisions)} missing pushes in the past year")

    yesterday = int(datetime.timestamp(datetime.utcnow() - timedelta(days=1)))

    task_group_id = slugId()
    logger.info(f"Triggering tasks in the {task_group_id} group")
    triggered = 0
    for revision, timestamp in reversed(missing_revisions):
        # If it's older than yesterday, we assume the group finished.
        # If it is newer than yesterday, we load the group and check if all tasks in it finished.
        if timestamp > yesterday:
            decision_task_id = taskcluster.get_decision_task(
                "mozilla-central", revision)
            if decision_task_id is None:
                continue

            group = taskcluster.get_task_details(
                decision_task_id)["taskGroupId"]
            if not all(task["status"]["state"] in taskcluster.FINISHED_STATUSES
                       for task in taskcluster.get_tasks_in_group(group)
                       if taskcluster.is_coverage_task(task["task"])):
                continue

        trigger_task(task_group_id, revision)
        triggered_revisions.add(revision)
        triggered += 1
        if triggered == MAXIMUM_TRIGGERS:
            break

    cctx = zstandard.ZstdCompressor(threads=-1)
    with open(triggered_revisions_path, "wb") as zf:
        with cctx.stream_writer(zf) as compressor:
            with io.TextIOWrapper(compressor, encoding="ascii") as f:
                f.write("\n".join(triggered_revisions))
Example #21
    def test_fileno_file(self):
        with tempfile.TemporaryFile("wb") as tf:
            dctx = zstd.ZstdDecompressor()
            writer = dctx.stream_writer(tf)

            self.assertEqual(writer.fileno(), tf.fileno())
Example #22
def _decode_base64_data(
        data_text: str,
        layer_width: int,
        compression: Optional[str] = None) -> objects.TileLayerGrid:
    """Decode base64 data.

    Args:
        data_text: Data to be decoded.
        layer_width: Width of each layer in tiles.
        compression: The type of compression for the data.

    Raises:
        ValueError: If compression type is unsupported.

    Returns:
        objects.TileLayerGrid: Tile grid.
    """
    tile_grid: objects.TileLayerGrid = [[]]

    unencoded_data = base64.b64decode(data_text)
    if compression == "zlib":
        unzipped_data = zlib.decompress(unencoded_data)
    elif compression == "gzip":
        unzipped_data = gzip.decompress(unencoded_data)
    elif compression == "zstd":
        modulename = 'zstandard'
        my_loader = importlib.find_loader(modulename)
        found = my_loader is not None
        if not found:
            raise ValueError(
                "Can't load 'zstd' compressed map without the 'zstandard' "
                "library available. Either install 'zstandard' or go to "
                "Map Properties and change Tile Layer Format to "
                "Base64, Base64 gzip, or Base64.")
        else:
            import zstandard
            dctx = zstandard.ZstdDecompressor()
            unzipped_data = dctx.decompress(unencoded_data)
    elif compression is None:
        unzipped_data = unencoded_data
    else:
        raise ValueError(f"Unsupported compression type: '{compression}'.")

    # Turn bytes into 4-byte integers
    byte_count = 0
    int_count = 0
    int_value = 0
    row_count = 0
    for byte in unzipped_data:
        int_value += byte << (byte_count * 8)
        byte_count += 1
        if not byte_count % 4:
            byte_count = 0
            int_count += 1
            tile_grid[row_count].append(int_value)
            int_value = 0
            if not int_count % layer_width:
                row_count += 1
                tile_grid.append([])

    tile_grid.pop()
    return tile_grid
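The byte-accumulation loop above is correct; for reference, an equivalent and arguably clearer way to turn the decompressed buffer into rows of 4-byte little-endian integers is a single struct.unpack call. This is only an illustrative alternative reusing the unzipped_data and layer_width names from the function above:

import struct

def tiles_from_bytes(unzipped_data: bytes, layer_width: int):
    # Each tile GID is a 4-byte little-endian unsigned integer, laid out row by row.
    count = len(unzipped_data) // 4
    flat = struct.unpack(f"<{count}I", unzipped_data)
    return [list(flat[i:i + layer_width]) for i in range(0, count, layer_width)]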
Example #23
def polydata_list_from_json(js, manager=None):
    """Deserialize a Javascript vtk.js PolyData object.

    Decompresses data buffers.
    """
    if js is None:
        return None
    else:
        decompressor = zstd.ZstdDecompressor()

        polydata_list = []
        for json_polydata in js:
            polydata = dict()
            for top_key, top_value in json_polydata.items():
                if isinstance(top_value, dict):
                    nested_value_copy = dict()
                    for nested_key, nested_value in top_value.items():
                        if not nested_key == 'compressedValues':
                            nested_value_copy[nested_key] = nested_value
                    polydata[top_key] = nested_value_copy
                else:
                    polydata[top_key] = top_value

            if 'points' in polydata:
                dtype = _type_to_numpy(polydata['points']['dataType'])
                if six.PY2:
                    asBytes = json_polydata['points'][
                        'compressedValues'].tobytes()
                    valuesBufferArrayCompressed = np.frombuffer(asBytes,
                                                                dtype=np.uint8)
                else:
                    valuesBufferArrayCompressed = np.frombuffer(
                        json_polydata['points']['compressedValues'],
                        dtype=np.uint8)
                numberOfBytes = json_polydata['points']['size'] * np.dtype(
                    dtype).itemsize
                valuesBufferArray = \
                    np.frombuffer(decompressor.decompress(valuesBufferArrayCompressed,
                        numberOfBytes),
                            dtype=dtype)
                valuesBufferArray.shape = (int(
                    json_polydata['points']['size'] / 3), 3)
                polydata['points']['values'] = valuesBufferArray

            for cell_type in ['verts', 'lines', 'polys', 'strips']:
                if cell_type in polydata:
                    dtype = _type_to_numpy(polydata[cell_type]['dataType'])
                    if six.PY2:
                        asBytes = json_polydata[cell_type][
                            'compressedValues'].tobytes()
                        valuesBufferArrayCompressed = np.frombuffer(
                            asBytes, dtype=np.uint8)
                    else:
                        valuesBufferArrayCompressed = np.frombuffer(
                            json_polydata[cell_type]['compressedValues'],
                            dtype=np.uint8)
                    numberOfBytes = json_polydata[cell_type][
                        'size'] * np.dtype(dtype).itemsize
                    valuesBufferArray = \
                        np.frombuffer(decompressor.decompress(valuesBufferArrayCompressed,
                            numberOfBytes),
                                dtype=dtype)
                    valuesBufferArray.shape = (
                        json_polydata[cell_type]['size'], )
                    polydata[cell_type]['values'] = valuesBufferArray

            for data_type in ['pointData', 'cellData']:
                if data_type in polydata:
                    data = json_polydata[data_type]
                    decompressed_data = dict()
                    for nested_key, nested_value in data.items():
                        if not nested_key == 'arrays':
                            decompressed_data[nested_key] = nested_value
                    decompressed_arrays = []
                    for array in json_polydata[data_type]['arrays']:
                        decompressed_array = dict()
                        for nested_key, nested_value in array['data'].items():
                            if not nested_key == 'compressedValues':
                                decompressed_array[nested_key] = nested_value
                        dtype = _type_to_numpy(decompressed_array['dataType'])
                        if six.PY2:
                            asBytes = array['data'][
                                'compressedValues'].tobytes()
                            valuesBufferArrayCompressed = np.frombuffer(
                                asBytes, dtype=np.uint8)
                        else:
                            valuesBufferArrayCompressed = np.frombuffer(
                                array['data']['compressedValues'],
                                dtype=np.uint8)
                        numberOfBytes = decompressed_array['size'] * np.dtype(
                            dtype).itemsize
                        valuesBufferArray = \
                            np.frombuffer(decompressor.decompress(valuesBufferArrayCompressed,
                                numberOfBytes),
                                    dtype=dtype)
                        valuesBufferArray.shape = (
                            decompressed_array['size'], )
                        decompressed_array['values'] = valuesBufferArray
                        decompressed_arrays.append(
                            {'data': decompressed_array})
                    decompressed_data['arrays'] = decompressed_arrays
                    polydata[data_type] = decompressed_data

            polydata_list.append(polydata)
        return polydata_list
Example #24
def zstd_decompress(content):
  ctx = zstd.ZstdDecompressor()
  return ctx.decompress(content)
Example #25
def zstd_decompress(path):
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as input_f:
        with open(path, "wb") as output_f:
            dctx.copy_stream(input_f, output_f)
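copy_stream() keeps memory usage flat because data is piped between the two file objects in chunks. A sketch of the compression counterpart, a hypothetical zstd_compress helper mirroring the function above:

import zstandard

def zstd_compress(path):
    # Stream "path" into "path.zst" without loading either file into memory.
    cctx = zstandard.ZstdCompressor()
    with open(path, "rb") as input_f:
        with open(f"{path}.zst", "wb") as output_f:
            cctx.copy_stream(input_f, output_f)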
Example #26
def download_manifest(
    manifest_path,  # type: Text
    tags_func,  # type: Callable[[], List[Text]]
    url_func,  # type: Callable[[List[Text]], Optional[List[Text]]]
    force=False  # type: bool
):
    # type: (...) -> bool
    if not force and not should_download(manifest_path):
        return False

    tags = tags_func()

    urls = url_func(tags)
    if not urls:
        logger.warning("No generated manifest found")
        return False

    for url in urls:
        logger.info("Downloading manifest from %s" % url)
        try:
            resp = urlopen(url)
        except Exception:
            logger.warning("Downloading pregenerated manifest failed")
            continue

        if resp.code != 200:
            logger.warning(
                "Downloading pregenerated manifest failed; got HTTP status %d"
                % resp.code)
            continue

        if url.endswith(".zst"):
            if not zstandard:
                continue
            try:
                dctx = zstandard.ZstdDecompressor()
                decompressed = dctx.decompress(resp.read())
            except IOError:
                logger.warning("Failed to decompress downloaded file")
                continue
        elif url.endswith(".bz2"):
            try:
                decompressed = bz2.decompress(resp.read())
            except IOError:
                logger.warning("Failed to decompress downloaded file")
                continue
        elif url.endswith(".gz"):
            fileobj = io.BytesIO(resp.read())
            try:
                with gzip.GzipFile(fileobj=fileobj) as gzf:
                    data = read_gzf(gzf)  # type: ignore
                    decompressed = data
            except IOError:
                logger.warning("Failed to decompress downloaded file")
                continue
        else:
            logger.warning("Unknown file extension: %s" % url)
            continue
        break
    else:
        return False

    try:
        with open(manifest_path, "wb") as f:
            f.write(decompressed)
    except Exception:
        logger.warning("Failed to write manifest")
        return False
    logger.info("Manifest downloaded")
    return True
Example #27
		
	flags = input[7] & 0x0F
	
	input = input[8:]
	aesKey = unwrapKey(input[0:256], args.privatekeys)
	input = input[256:]
	
	len = int.from_bytes(input[0:8], byteorder='little')
	
	cipher = AES.new(aesKey, AES.MODE_ECB)
	input = cipher.decrypt(input[8:])
	
	if flags == 0x0E: #zlib
		input = zlib.decompress(input[0:len])
	elif flags == 0x0D: #zstd
		dctx = zstd.ZstdDecompressor()
		input = dctx.decompress(input[0:len])
	elif flags == 0x00: #plaintext
		input = input[0:len]
	else:
		raise IOError('invalid flag')
		
	with open(args.output, 'wb') as f:
		f.write(input)
else:
	if args.zlib:
		flag = 0x0E
		print('compressing with zlib')
		buf = zlib.compress(input, 9)
	elif args.zstd:
		flag = 0x0D
Example #28
 def __init__(self):
     dctx = zstd.ZstdDecompressor()
     self._decomp = dctx.decompressobj()
     self.eof = False
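decompressobj() mirrors the incremental zlib/bz2 API: chunks of a frame go in, decompressed bytes come out as soon as they are available. A minimal sketch of feeding arbitrary-sized chunks (the 7-byte chunking is an arbitrary choice for illustration):

import zstandard as zstd

frame = zstd.ZstdCompressor().compress(b"streamed payload")

dobj = zstd.ZstdDecompressor().decompressobj()
out = bytearray()
for i in range(0, len(frame), 7):
    out += dobj.decompress(frame[i:i + 7])  # returns whatever is ready so far

assert bytes(out) == b"streamed payload"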
Example #29
    def test_bad_write_size(self):
        dctx = zstd.ZstdDecompressor()

        with self.assertRaisesRegex(ValueError, "write_size must be positive"):
            dctx.decompressobj(write_size=0)
Example #30
    def test_simple(self):
        data = zstd.ZstdCompressor(level=1).compress(b'foobar')

        dctx = zstd.ZstdDecompressor()
        dobj = dctx.decompressobj()
        self.assertEqual(dobj.decompress(data), b'foobar')