Example #1
 def test_decompress_trailer(self):
     input_data = b"2099023098234882923049823094823094898239230982349081231290381209380981203981209381238901283098908123109238098123"
     compressed = lz4frame.compress(input_data)
     with self.assertRaisesRegexp(ValueError,
                                  r'^Extra data: 64 trailing bytes'):
         lz4frame.decompress(compressed + b'A' * 64)
     # This API does not support frame concatenation!
     with self.assertRaisesRegexp(ValueError,
                                  r'^Extra data: \d+ trailing bytes'):
         lz4frame.decompress(compressed + compressed)
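Example #1 above shows that the one-shot lz4frame.decompress() rejects trailing bytes, so concatenated frames need the streaming object instead. A minimal sketch, assuming lz4.frame.LZ4FrameDecompressor exposes eof and unused_data as described in the python-lz4 documentation; the helper name decompress_concatenated is illustrative, not part of the library:

import lz4.frame as lz4frame

def decompress_concatenated(buf):
    # Decode one frame per pass; unused_data holds whatever followed that frame.
    out = b''
    while buf:
        d = lz4frame.LZ4FrameDecompressor()
        out += d.decompress(buf)
        if not d.eof:
            break  # truncated final frame: stop rather than loop on the same bytes
        buf = d.unused_data
    return out

# e.g. decompress_concatenated(compressed + compressed) should equal input_data * 2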
Example #2
 def test_checksum_failure(self):
     input_data = b"2099023098234882923049823094823094898239230982349081231290381209380981203981209381238901283098908123109238098123"
     compressed = lz4frame.compress(
         input_data, content_checksum=lz4frame.CONTENTCHECKSUM_ENABLED)
     with self.assertRaisesRegexp(
             RuntimeError,
             r'^LZ4F_decompress failed with code: ERROR_contentChecksum_invalid'
     ):
         last = struct.unpack('B', compressed[-1:])[0]
         lz4frame.decompress(compressed[:-1] +
                             struct.pack('B', last ^ 0x42))
Example #3
def test_block_checksum_failure(data):
    compressed = lz4frame.compress(
        data,
        content_checksum=True,
        block_checksum=True,
        return_bytearray=True,
    )
    message = r'^LZ4F_decompress failed with code: ERROR_blockChecksum_invalid$'
    if len(compressed) > 32:
        with pytest.raises(RuntimeError, match=message):
            compressed[18] = compressed[18] ^ 0x42
            lz4frame.decompress(compressed)
Example #5
 def test_compress_without_content_size(self):
     input_data = b"2099023098234882923049823094823094898239230982349081231290381209380981203981209381238901283098908123109238098123"
     compressed = lz4frame.compress(input_data, content_size_header=False)
     frame = lz4frame.get_frame_info(compressed)
     self.assertEqual(frame['contentSize'], 0)
     decompressed = lz4frame.decompress(compressed)
     self.assertEqual(input_data, decompressed)
Example #6
    def update_network(self, minibatchs):

        indices_all, priorities_all, losses = [], [], []

        with util.Timer("Learner update:"):

            for (indices, weights, samples) in minibatchs:

                samples = [pickle.loads(lz4f.decompress(s)) for s in samples]

                priorities, loss_info = self.update(weights, samples)

                indices_all += indices

                priorities_all += priorities

                losses.append(loss_info)

        current_weights = self.get_weights()

        total_loss = sum([l[0] for l in losses]) / len(losses)
        policy_loss = sum([l[1] for l in losses]) / len(losses)
        value_loss = sum([l[2] for l in losses]) / len(losses)
        reward_loss = sum([l[3] for l in losses]) / len(losses)

        losses_mean = (total_loss, policy_loss, value_loss, reward_loss)

        return (current_weights, indices_all, priorities_all, losses_mean)
Example #7
def test_spark_functionality():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")

    import lz4.frame as lz4f
    import cloudpickle as cpkl

    from coffea import hist
    from coffea.processor.spark.tests import check_spark_functionality
    one, two, hists = check_spark_functionality()

    #check that spark produced the correct output dataset
    assert ((one == two) and (one == 3))

    #make sure that the returned histograms are all the same (as they should be in this case)
    assert ((len(hists[0]) == len(hists[1]))
            and (len(hists[1]) == len(hists[2])))

    inflated = []
    for i in range(3):
        inflated.append(cpkl.loads(lz4f.decompress(hists[i])))

    final = inflated.pop()
    for ihist in inflated:
        final.add(ihist)

    #make sure the accumulator is working right after being blobbed through spark
    assert (final['cutflow']['dummy'] == 3)
Example #8
def nope_test_spark_functionality():
    try:
        import pyspark
    except ImportError:
        warnings.warn('pyspark not installed, skipping tests')
        return
    except Exception as e:
        warnings.warn('other error when trying to import pyspark!')
        raise e

    import lz4.frame as lz4f
    import cloudpickle as cpkl

    from fnal_column_analysis_tools import hist
    from fnal_column_analysis_tools.processor.spark.tests import check_spark_functionality
    one, two, hists = check_spark_functionality()

    #check that spark produced the correct output dataset
    assert ((one == two) and (one == 3))

    #make sure that the returned histograms are all the same (as they should be in this case)
    assert ((len(hists[0]) == len(hists[1]))
            and (len(hists[1]) == len(hists[2])))

    inflated = []
    for i in range(3):
        inflated.append(cpkl.loads(lz4f.decompress(hists[i])))

    final = inflated.pop()
    for ihist in inflated:
        final.add(ihist)

    #make sure the accumulator is working right after being blobbed through spark
    assert (final['cutflow']['dummy'] == 3)
Example #9
    def __call__(self,
                 dfk,
                 items,
                 processor_instance,
                 output,
                 unit='items',
                 desc='Processing',
                 timeout=None,
                 flatten=True):
        procstr = lz4f.compress(cpkl.dumps(processor_instance))

        nitems = len(items)
        ftr_to_item = set()
        for dataset, fn, treename, chunksize, index in items:
            if dataset not in self._counts:
                self._counts[dataset] = 0
            ftr_to_item.add(
                coffea_pyapp(dataset,
                             fn,
                             treename,
                             chunksize,
                             index,
                             procstr,
                             timeout=timeout,
                             flatten=flatten))

        for ftr in tqdm(as_completed(ftr_to_item),
                        total=nitems,
                        unit='items',
                        desc='Processing'):
            blob, nentries, dataset = ftr.result()
            self._counts[dataset] += nentries
            output.add(pkl.loads(lz4f.decompress(blob)))
Example #10
def _work_function(item, processor_instance, flatten=False, savemetrics=False, mmap=False):
    if processor_instance == 'heavy':
        item, processor_instance = item
    if not isinstance(processor_instance, ProcessorABC):
        processor_instance = cloudpickle.loads(lz4f.decompress(processor_instance))
    if mmap:
        localsource = {}
    else:
        opts = dict(uproot.FileSource.defaults)
        opts.update({'parallel': None})

        def localsource(path):
            return uproot.FileSource(path, **opts)

    file = uproot.open(item.filename, localsource=localsource)
    tree = file[item.treename]
    df = LazyDataFrame(tree, item.chunksize, item.index, flatten=flatten)
    df['dataset'] = item.dataset
    tic = time.time()
    out = processor_instance.process(df, locals())
    toc = time.time()
    metrics = dict_accumulator()
    if savemetrics:
        if isinstance(file.source, uproot.source.xrootd.XRootDSource):
            metrics['bytesread'] = value_accumulator(int, file.source.bytesread)
            metrics['dataservers'] = set_accumulator({file.source._source.get_property('DataServer')})
        metrics['columns'] = set_accumulator(df.materialized)
        metrics['entries'] = value_accumulator(int, df.size)
        metrics['processtime'] = value_accumulator(float, toc - tic)
    wrapped_out = dict_accumulator({'out': out, 'metrics': metrics})
    file.source.close()
    return wrapped_out
Example #11
    def extract_yaml_reports(filename):
        """Fetch compressed tarball from S3 private repo
        yield any YAML report
        """
        # Local caching - disabled
        # if os.path.isfile(filename):
        #     with open(filename, "rb") as f:
        #         fileobj=BytesIO(f.read())
        #         fileobj.seek(0)
        # else:
        #     obj = s3.Object("ooni-data-private", "canned/{}".format(filename))
        #     fileobj = BytesIO(lz4frame.decompress(obj.get()["Body"].read()))
        #     with open(filename, "wb") as f:
        #         fileobj.seek(0)
        #         f.write(fileobj.read())
        #     fileobj.seek(0)
        obj = s3.Object("ooni-data-private", "canned/{}".format(filename))
        fileobj = BytesIO(lz4frame.decompress(obj.get()["Body"].read()))

        tf = tarfile.TarFile(fileobj=fileobj)
        for m in tf.getmembers():
            if m.name.endswith(".yaml"):
                f = tf.extractfile(m)
                canned_yaml = BytesIO(f.read())
                yield (m.name, canned_yaml)
Example #12
def reduce_histos_raw(df, processor_instance, lz4_clevel):
    histos = df['histos']
    mask = (histos.str.len() > 0)
    outhist = processor_instance.accumulator.identity()
    for line in histos[mask]:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return pd.DataFrame(data={'histos': np.array([lz4f.compress(pkl.dumps(outhist), compression_level=lz4_clevel)], dtype='O')})
Example #13
    def generator_data(self,
                       batch_size=20,
                       args={
                           'type': "train",
                           'id': {
                               "$lt": 500
                           }
                       }):

        pc = self.db.DataSet.find(args)
        flist = pc.distinct('imageData')
        fldict = {}
        for f in flist:
            s = self.datafs.get(f).read()
            s2 = decompress(s)
            fldict[f] = pickle.loads(s2)

        print(pc.count())

        for i in range(0, pc.count(), batch_size):

            pc.rewind()

            rt = [(fldict[x['imageData']][x['id']].reshape(1, 784), x['label'])
                  for x in pc[i:i + batch_size]]

            dl = list(zip(*rt))
            d = np.concatenate(dl[0])
            l = np.array(dl[1])

            yield d, l
Example #14
    def find_one_params(self, args={}, sort=None, lz4_decomp=False):
        """ Find one parameter from MongoDB Buckets.

        Parameters
        ----------
        args : dictionary, find items.

        Returns
        --------
        params : the parameters, return False if nothing found.
        f_id : the Buckets ID of the parameters, return False if nothing found.
        """
        d = self.db.Params.find_one(filter=args, sort=sort)

        if d is not None:
            f_id = d['f_id']
        else:
            print(("[TensorDB] Cannot find: {}".format(args)))
            return False, False

        st = time.time()
        d = self.paramsfs.get(f_id).read()
        # print('get time', time.time()-st)

        if lz4_decomp:
            # s = time.time()
            d = decompress(d)
            # print('decomp time', time.time()-s)

        # s = time.time()
        params = self.__deserialization(d)
        # print('deseri time', time.time()-s)

        print(("[TensorDB] Find one params SUCCESS, {} took: {}s".format(args, round(time.time()-st, 2))))
        return params, f_id
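The docstring above promises a (params, f_id) pair, or (False, False) when nothing matches. A hypothetical usage sketch; the TensorDB import path and constructor arguments are assumptions based on this method, not a verified API:

from tensorlayer.db import TensorDB  # assumed location of the class this method belongs to

db = TensorDB(ip='localhost', port=27017, db_name='demo')
params, f_id = db.find_one_params(args={'type': 'train'}, lz4_decomp=True)
if params is False:
    print('no matching parameter set stored yet')
else:
    print('loaded parameter set {} with {} arrays'.format(f_id, len(params)))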
Example #15
def replace_masks(in_db, out_db, synonyms_file):
    synonyms_dict = get_synonyms_dict(synonyms_file)
    tok = BertTokenizer.from_pretrained('bert-base-cased')

    new_set = lmdb.open(out_db, readonly=False, create=True, map_size=10e9)
    new_set_c = new_set.begin(write=True)

    env = lmdb.open(in_db, readonly=True, create=False, map_size=4 * 1024**4)
    txn = env.begin()
    cursor = txn.cursor()

    tokenized_queries = {}

    id2len_dict = {}

    for key, value in tqdm(cursor):
        q_id = key.decode()
        q = msgpack.loads(decompress(value))

        query_tokens, query_ids = replace_masks_in_q(q_id, q['input_ids'],
                                                     synonyms_dict, tok)
        q['toked_question'] = query_tokens
        q['input_ids'] = query_ids
        tokenized_queries[q_id] = q

        id2len_dict[q_id] = len(query_ids)
        new_q = compress(msgpack.dumps(q))
        new_set_c.put(key, new_q)

    print("committing changes")
    new_set_c.commit()
    with open('{}/id2len.json'.format(out_db), 'w') as outfile:
        json.dump(id2len_dict, outfile)
Example #16
def agg_histos(df):
    global processor_instance, lz4_clevel
    goodlines = df[df.str.len() > 0]
    outhist = processor_instance.accumulator.identity()
    for line in goodlines:
        outhist.add(cpkl.loads(lz4f.decompress(line)))
    return lz4f.compress(cpkl.dumps(outhist), compression_level=lz4_clevel)
Example #17
 def decompress_segments(minibatch):
     indices, weights, compressed_segments = minibatch
     segments = [
         pickle.loads(lz4f.decompress(compressed_seg))
         for compressed_seg in compressed_segments
     ]
     return (indices, weights, segments)
Example #18
def test_roundtrip_1(
        data,
        block_size,
        block_linked,
        content_checksum,
        block_checksum,
        compression_level,
        store_size):

    compressed = lz4frame.compress(
        data,
        store_size=store_size,
        compression_level=compression_level,
        block_size=block_size,
        block_linked=block_linked,
        content_checksum=content_checksum,
        block_checksum=block_checksum,
    )

    get_frame_info_check(
        compressed,
        len(data),
        store_size,
        block_size,
        block_linked,
        content_checksum,
        block_checksum,
    )
    decompressed, bytes_read = lz4frame.decompress(
        compressed, return_bytes_read=True)
    assert bytes_read == len(compressed)
    assert decompressed == data
Example #19
    def find_all_params(self, args={}, lz4_decomp=False):
        """ Find all parameter from MongoDB Buckets

        Parameters
        ----------
        args : dictionary, find items

        Returns
        --------
        params : the parameters, return False if nothing found.

        """
        st = time.time()
        pc = self.db.Params.find(args)

        if pc is not None:
            f_id_list = pc.distinct('f_id')
            params = []
            for f_id in f_id_list:  # you may have multiple Buckets files
                tmp = self.paramsfs.get(f_id).read()
                if lz4_decomp:
                    tmp = decompress(tmp)
                params.append(self.__deserialization(tmp))
        else:
            print("[TensorDB] Cannot find: {}".format(args))
            return False

        print("[TensorDB] Find all params SUCCESS, took: {}s".format(
            round(time.time() - st, 2)))
        return params
Example #20
def test_parsl_funcs():
    parsl = pytest.importorskip("parsl", minversion="0.7.2")

    import os.path as osp
    from coffea.processor.parsl.detail import derive_chunks

    filename = osp.abspath('tests/samples/nano_dy.root')
    dataset = 'Z+Jets'
    treename = 'Events'
    chunksize = 20
    ds, tn, test = derive_chunks.func(filename, treename, chunksize, dataset)

    assert (dataset == ds)
    assert (treename == tn)
    assert ('nano_dy.root' in test[0][0])
    assert (test[0][1] == 20)
    assert (test[0][2] == 0)

    from coffea.processor.parsl.parsl_executor import coffea_pyapp
    from coffea.processor.test_items import NanoTestProcessor
    import pickle as pkl
    import cloudpickle as cpkl
    import lz4.frame as lz4f

    procpkl = lz4f.compress(cpkl.dumps(NanoTestProcessor()))

    out = coffea_pyapp.func('ZJets', filename, treename, chunksize, 0, procpkl)

    hists = pkl.loads(lz4f.decompress(out[0]))
    assert (hists['cutflow']['ZJets_pt'] == 4)
    assert (hists['cutflow']['ZJets_mass'] == 1)
    assert (out[1] == 10)
    assert (out[2] == 'ZJets')
Example #21
 def test_LZ4FrameCompressor(self):
     input_data = b"2099023098234882923049823094823094898239230982349081231290381209380981203981209381238901283098908123109238098123"
     with lz4frame.LZ4FrameCompressor() as compressor:
         compressed = compressor.compress(input_data)
         compressed += compressor.flush()
     decompressed = lz4frame.decompress(compressed)
     self.assertEqual(input_data, decompressed)
Example #22
    def __call__(self,
                 spark,
                 dfslist,
                 theprocessor,
                 output,
                 thread_workers,
                 status=True,
                 unit='datasets',
                 desc='Processing'):
        # processor needs to be a global
        global processor_instance, coffea_udf
        processor_instance = theprocessor
        # get columns from processor
        columns = processor_instance.columns
        cols_w_ds = ['dataset'] + columns
        # make our udf
        tmpl = self._env.get_template(self._template_name)
        render = tmpl.render(cols=columns)
        exec(render)

        # cache the input datasets if it's not already done
        if self._cacheddfs is None:
            self._cacheddfs = {}
            self._counts = {}
            # go through each dataset and thin down to the columns we want
            for ds, (df, counts) in dfslist.items():
                self._cacheddfs[ds] = df.select(*cols_w_ds).cache()
                self._counts[ds] = counts

        def spex_accumulator(total, result):
            ds, df = result
            total[ds] = df

        with ThreadPoolExecutor(max_workers=thread_workers) as executor:
            futures = set()
            for ds, df in self._cacheddfs.items():
                futures.add(
                    executor.submit(self._launch_analysis, ds, df, coffea_udf,
                                    cols_w_ds))
            # wait for the spark jobs to come in
            self._rawresults = {}
            futures_handler(futures,
                            self._rawresults,
                            status,
                            unit,
                            desc,
                            futures_accumulator=spex_accumulator)

        for ds, bitstream in self._rawresults.items():
            if bitstream is None:
                raise Exception(
                    'No pandas dataframe returns from spark in dataset: %s, something went wrong!'
                    % ds)
            if bitstream.empty:
                raise Exception(
                    'The histogram list returned from spark is empty in dataset: %s, something went wrong!'
                    % ds)
            bits = bitstream[bitstream.columns[0]][0]
            output.add(pkl.loads(lz4f.decompress(bits)))
Example #23
    def test_lz4(self):
        data = self._data
        compress = verify_and_get_compress_func("lz4")
        self.assertIsNotNone(compress)

        import lz4.frame as lz4

        self.assertEqual(data, lz4.decompress(compress(data)))
Example #24
 def calc_unpacked_data(self, byte_data):
     if lz4 is None:
         raise errors.InvalidAlgorithm("lz4 module needed for .lz4 support")
     try:
         unpacked = lz4.decompress(bytes(byte_data))
     except RuntimeError as e:
         raise errors.InvalidAlgorithm(e)
     return unpacked
Example #25
def test_decompress_return_type_2():
    c = lz4frame.compress(b'', return_bytearray=False)
    r = lz4frame.decompress(
        c,
        return_bytearray=True,
        return_bytes_read=False
    )
    assert isinstance(r, bytearray)
Example #26
def _maybe_decompress(item):
    if isinstance(item, bytes):
        item = cloudpickle.loads(lz4f.decompress(item))
    if isinstance(item, AccumulatorABC):
        return item
    raise ValueError(
        "Executors can only reduce accumulators or LZ4-compressed pickled accumulators"
    )
Example #27
def loads(s):
    try:
        return msgpack.unpackb(
            decompress(s), ext_hook=decode_ext, encoding='utf-8')
    except Exception:
        # we queue work occasionally from lambdas or other systems not using
        # the worker class
        return job_default_load(s)
Example #28
def agg_histos_raw(series, processor_instance, lz4_clevel):
    goodlines = series[series.str.len() > 0]
    if goodlines.size == 1:  # short-circuit trivial aggregations
        return goodlines[0]
    outhist = processor_instance.accumulator.identity()
    for line in goodlines:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return lz4f.compress(pkl.dumps(outhist), compression_level=lz4_clevel)
Example #29
def dir_delta(sig, new_path):
    with open(RDIFF_SIG_FILENAME, "wb") as f:
        f.write(decompress(sig))
    p = subprocess.run(["rdiffdir", "delta", RDIFF_SIG_FILENAME, new_path, "-"],
                       stdout=subprocess.PIPE,
                       input=sig)
    os.remove(RDIFF_SIG_FILENAME)
    return compress(p.stdout)
Example #30
 def test_decompress_truncated(self):
     input_data = b"2099023098234882923049823094823094898239230982349081231290381209380981203981209381238901283098908123109238098123"
     for chksum in (lz4frame.CONTENTCHECKSUM_DISABLED,
                    lz4frame.CONTENTCHECKSUM_ENABLED):
         for conlen in (0, len(input_data)):
             context = lz4frame.create_compression_context()
             compressed = lz4frame.compress_begin(context,
                                                  content_checksum=chksum,
                                                  source_size=conlen)
             compressed += lz4frame.compress_update(context, input_data)
             compressed += lz4frame.compress_end(context)
             for i in range(len(compressed)):
                 with self.assertRaisesRegexp(
                         RuntimeError,
                         r'^(LZ4F_getFrameInfo failed with code: ERROR_frameHeader_incomplete|LZ4F_freeDecompressionContext reported unclean decompressor state \(truncated frame\?\): \d+)$'
                 ):
                     lz4frame.decompress(compressed[:i])
Example #31
def coffea_pyapp(dataset,
                 fn,
                 treename,
                 chunksize,
                 index,
                 procstr,
                 timeout=None,
                 flatten=True):
    import uproot
    import cloudpickle as cpkl
    import pickle as pkl
    import lz4.frame as lz4f
    from coffea import hist, processor
    from coffea.processor.accumulator import value_accumulator

    uproot.XRootDSource.defaults["parallel"] = False

    lz4_clevel = 1

    # instrument xrootd source
    if not hasattr(uproot.source.xrootd.XRootDSource, '_read_real'):

        def _read(self, chunkindex):
            self.bytesread = getattr(self, 'bytesread', 0) + self._chunkbytes
            return self._read_real(chunkindex)

        uproot.source.xrootd.XRootDSource._read_real = uproot.source.xrootd.XRootDSource._read
        uproot.source.xrootd.XRootDSource._read = _read

    processor_instance = cpkl.loads(lz4f.decompress(procstr))

    afile = uproot.open(fn)

    tree = None
    if isinstance(treename, str):
        tree = afile[treename]
    elif isinstance(treename, Sequence):
        for name in reversed(treename):
            if name in afile:
                tree = afile[name]
    else:
        raise Exception('treename must be a str or Sequence but is a %s!' %
                        repr(type(treename)))

    if tree is None:
        raise Exception('No tree found, out of possible tree names: %s' %
                        repr(treename))

    df = processor.LazyDataFrame(tree, chunksize, index, flatten=flatten)
    df['dataset'] = dataset

    vals = processor_instance.process(df)
    if isinstance(afile.source, uproot.source.xrootd.XRootDSource):
        vals['_bytesread'] = value_accumulator(int) + afile.source.bytesread
    valsblob = lz4f.compress(pkl.dumps(vals), compression_level=lz4_clevel)

    return valsblob, df.size, dataset
Example #32
def loads(s):
    try:
        return msgpack.unpackb(decompress(s),
                               ext_hook=decode_ext,
                               encoding='utf-8')
    except Exception:
        # we queue work occasionally from lambdas or other systems not using
        # the worker class
        return job_default_load(s)
Example #33
def unpack_data(dump, w=None, h=None):
    if dump is None:
        return None
    decompressed = zs.decompress(dump)
    fast_decompressed = lz.decompress(decompressed)
    if fast_decompressed[:5] == b'Chain':
        return pickle.loads(fast_decompressed[5:])
    else:
        return np.reshape(np.frombuffer(fast_decompressed), newshape=(h, w))
Example #34
def test_decompress_return_type_3():
    c = lz4frame.compress(b'', return_bytearray=False)
    r = lz4frame.decompress(
        c,
        return_bytearray=False,
        return_bytes_read=True
    )
    assert isinstance(r, tuple)
    assert isinstance(r[0], bytes)
    assert isinstance(r[1], int)
Example #35
def test_roundtrip_multiframe_1(data):
    nframes = 4

    compressed = b''
    for _ in range(nframes):
        compressed += lz4frame.compress(data)

    decompressed = b''
    for _ in range(nframes):
        decompressed += lz4frame.decompress(compressed)

    assert len(decompressed) == nframes * len(data)
    assert data * nframes == decompressed
Example #36
def test_roundtrip_multiframe_2(data):
    nframes = 4

    compressed = b''
    ctx = lz4frame.create_compression_context()
    for _ in range(nframes):
        compressed += lz4frame.compress_begin(ctx)
        compressed += lz4frame.compress_chunk(ctx, data)
        compressed += lz4frame.compress_flush(ctx)

    decompressed = b''
    for _ in range(nframes):
        decompressed += lz4frame.decompress(compressed)

    assert len(decompressed) == nframes * len(data)
    assert data * nframes == decompressed
Example #37
def test_roundtrip_2(data,
                     block_size,
                     block_linked,
                     content_checksum,
                     block_checksum,
                     compression_level,
                     auto_flush,
                     store_size):

    c_context = lz4frame.create_compression_context()

    kwargs = {}
    kwargs['compression_level'] = compression_level
    kwargs['block_size'] = block_size
    kwargs['block_linked'] = block_linked
    kwargs['content_checksum'] = content_checksum
    kwargs['block_checksum'] = block_checksum
    kwargs['auto_flush'] = auto_flush
    if store_size is True:
        kwargs['source_size'] = len(data)

    compressed = lz4frame.compress_begin(
        c_context,
        **kwargs
    )
    compressed += lz4frame.compress_chunk(
        c_context,
        data
    )
    compressed += lz4frame.compress_flush(c_context)
    get_frame_info_check(
        compressed,
        len(data),
        store_size,
        block_size,
        block_linked,
        content_checksum,
        block_checksum,
    )
    decompressed, bytes_read = lz4frame.decompress(
        compressed, return_bytes_read=True)
    assert bytes_read == len(compressed)
    assert decompressed == data
Example #38
def test_decompress_truncated(data):
    compressed = lz4frame.compress(data)

    message = r'^LZ4F_getFrameInfo failed with code: ERROR_frameHeader_incomplete'
    with pytest.raises(RuntimeError, match=message):
        lz4frame.decompress(compressed[:6])

    for i in range(16, len(compressed) - 1, 5):  # 15 is the max size of the header
        message = r'^Frame incomplete. LZ4F_decompress returned: {0}'.format(
            len(compressed) - i)
        try:
            lz4frame.decompress(compressed[:i])
        except RuntimeError as r:
            print(r)
        with pytest.raises(RuntimeError, match=message):
            lz4frame.decompress(compressed[:i])
Example #39
def stream_datum(atclv_root, bucket, take_file=None):
    with gzip.GzipFile(os.path.join(atclv_root, bucket, autoclaving.INDEX_FNAME), 'r') as indexfd:
        filefd = None
        dociter = autoclaving.stream_json_blobs(indexfd)
        for _, doc in dociter:
            doc = ujson.loads(doc)
            t = doc['type']
            if t == 'datum':
                # {"orig_sha1": "q7…I=", "text_off": 156846, "text_size": 58327, "type": "datum"}
                intra_off = doc['text_off'] - text_off
                datum = blob[intra_off:intra_off+doc['text_size']]
                assert intra_off >= 0 and len(datum) == doc['text_size']
                datum = ujson.loads(datum)
                doc['frame_off'] = frame_off
                doc['frame_size'] = frame_size
                doc['intra_off'] = intra_off
                doc['intra_size'] = doc['text_size']
                doc['datum'] = datum
                yield DATUM, doc
                del intra_off, datum

            elif t == 'frame':
                # {"file_off": 0, "file_size": 162864, "text_off": 0, "text_size": 362462, … }
                frame_off, frame_size = doc['file_off'], doc['file_size']
                assert filefd.tell() == frame_off
                blob = filefd.read(frame_size)
                assert len(blob) == frame_size
                blob = lz4frame.decompress(blob)
                assert len(blob) == doc['text_size']
                text_off = doc['text_off']

            elif t == '/frame':
                del frame_off, frame_size, text_off, blob

            elif t == 'report':
                # {"orig_sha1": "HO…U=",
                #  "src_size": 104006450,
                #  "textname": "2017-01-01/20161231T000030Z-US-AS…-0.2.0-probe.json", …}
                yield REPORT_START, doc

            elif t == '/report':
                # {"info": "<class '__main__.TruncatedReportError'>",
                #  "src_cutoff": 49484700, … }
                yield REPORT_END, doc

            elif t == 'file':
                # {"filename": "2017-01-01/20161231T000030Z-US-AS…-0.2.0-probe.json.lz4", …}
                filename = doc['filename']
                assert filename.startswith(bucket)
                if take_file is None or take_file(filename):
                    filefd = open(os.path.join(atclv_root, filename), 'rb')
                    del filename
                    yield FILE_START, doc
                else:
                    for _, skipdoc in dociter:
                        if '/file"' in skipdoc and ujson.loads(skipdoc)['type'] == '/file':
                            break
                    del filename, skipdoc

            elif t == '/file':
                # {"file_crc32": -156566611, "file_sha1": "q/…8=", "file_size": 18132131, …}
                assert filefd.tell() == doc['file_size']
                filefd.close()
                filefd = None
                yield FILE_END, doc

            elif t == 'badblob':
                # {"orig_sha1": "RXQFwOtpKtS0KicYi8JnWeQYYBw=",
                #  "src_off": 99257, "src_size": 238,
                #  "info": "<class 'yaml.constructor.ConstructorError'>", …}
                yield BADBLOB, doc

            else:
                raise RuntimeError('Unknown record type', t)
        if filefd is not None:
            raise RuntimeError('Truncated autoclaved index', atclv_root, bucket)
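The branch comments above outline the record protocol ('file', 'frame', 'datum', 'report' and their closing markers). A hypothetical consumer loop reusing the stream_datum signature and the DATUM constant from this snippet; handle_measurement and the paths are placeholders, not part of the original module:

def handle_measurement(measurement):
    # placeholder: a real pipeline would normalise and persist the measurement
    print(measurement.get('test_name'))

for kind, doc in stream_datum('/srv/autoclaved', '2017-01-01'):
    if kind == DATUM:
        handle_measurement(doc['datum'])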
Example #40
def test_content_checksum_failure(data):
    compressed = lz4frame.compress(data, content_checksum=True)
    message = r'^LZ4F_decompress failed with code: ERROR_contentChecksum_invalid$'
    with pytest.raises(RuntimeError, match=message):
        last = struct.unpack('B', compressed[-1:])[0]
        lz4frame.decompress(compressed[:-1] + struct.pack('B', last ^ 0x42))