Example #1
    def compute(self, split):
        buffers = [list() for i in self.fields]
        remain_size = STRIPE_DATA_SIZE
        path = os.path.join(self.path, '%04d.dt' % split.index)
        indices = dict((i, AdaptiveIndex()) for i in self.indices)

        def write_stripe(f, compressed, header, padding=True):
            h = compress(marshal.dumps(header))
            assert len(h) < STRIPE_HEADER_SIZE
            f.write(struct.pack('I', len(h)))
            f.write(h)
            padding_size = STRIPE_SIZE - len(h) - 4
            for c in compressed:
                f.write(c)
                padding_size -= len(c)

            if padding:
                f.write('\0' * padding_size)

        with atomic_file(path) as f:
            stripe_id = 0
            for it in chain(self.prev.iterator(sp) for sp in split.splits):
                row = it[:len(self.fields)]
                size = len(marshal.dumps(tuple(row)))
                if size > STRIPE_DATA_SIZE:
                    raise RuntimeError('Row too big')

                if size > remain_size:
                    compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                    _sizes = tuple(map(len, compressed))
                    _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                    if size > _remain_size:
                        write_stripe(f, compressed, _sizes)
                        buffers = [list() for i in self.fields]
                        remain_size = STRIPE_DATA_SIZE
                        stripe_id += 1
                    else:
                        remain_size = _remain_size

                remain_size -= size
                for i, value in enumerate(row):
                    buffers[i].append(value)
                    field = self.fields[i]
                    if field in self.indices:
                        indices[field].add(value, stripe_id)

            if any(buffers):
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                write_stripe(f, compressed, _sizes, False)

            footer_indices = zlib.compress(cPickle.dumps(indices, -1))
            footer_fields = compress(marshal.dumps(self.fields))
            f.write(footer_indices)
            f.write(footer_fields)
            f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

        yield path
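For context, a rough sketch of how one of these stripes could be read back, assuming STRIPE_SIZE is the fixed stripe length and that decompress is the inverse of the compress used by the writer (both taken on faith from the code above; this is not the project's actual reader):

import marshal
import struct

def read_stripe(f, decompress, stripe_size):
    # Mirror of write_stripe(): 4-byte header length, compressed header (the tuple
    # of per-column compressed sizes), one compressed chunk per column, then zero
    # padding up to stripe_size (the final stripe is written without padding).
    start = f.tell()
    (header_len,) = struct.unpack('I', f.read(4))
    sizes = marshal.loads(decompress(f.read(header_len)))
    columns = [marshal.loads(decompress(f.read(n))) for n in sizes]
    f.seek(start + stripe_size)
    return columns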
Example #2
def compress_array(str_list):
    """
    Compress an array of strings

    By default LZ4 mode is standard in interactive mode,
    and high compression in applications/scripts
    """
    if not ENABLE_PARALLEL:
        return [lz4.compress(s) for s in str_list]

    # With fewer than 50 chunks it's quicker to compress sequentially.
    if len(str_list) > LZ4_N_PARALLEL:
        return clz4.compressarr(str_list)
    else:
        return [clz4.compress(s) for s in str_list]
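A small usage sketch of the sequential path, using the legacy module-level python-lz4 API (lz4.compress / lz4.decompress) that these examples rely on; clz4 and ENABLE_PARALLEL are project-specific and not needed for the roundtrip itself:

import lz4

chunks = [b"alpha" * 100, b"beta" * 100, b"gamma" * 100]
compressed = [lz4.compress(c) for c in chunks]      # the sequential branch above
restored = [lz4.decompress(c) for c in compressed]
assert restored == chunks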
Example #3
def compress_array(str_list):
    """
    Compress an array of strings

    By default LZ4 mode is standard in interactive mode,
    and high compression in applications/scripts
    """
    if not ENABLE_PARALLEL:
        return [lz4.compress(s) for s in str_list]

    # With fewer than 50 chunks it's quicker to compress sequentially.
    if len(str_list) > LZ4_N_PARALLEL:
        return clz4.compressarr(str_list)
    else:
        return [clz4.compress(s) for s in str_list]
Example #4
 def _put(self, item):
     hash_key = item.__hash__().to_bytes(length=20,
                                         byteorder=sys.byteorder,
                                         signed=True)
     data = compress(pickle.dumps(
         item, protocol=4)) if self.compression else pickle.dumps(
             item, protocol=4)
     with self._sem:
         try:
             with self._lmdb.begin(write=True) as txn:
                 if not txn.replace(hash_key, data, db=self._hashes_db):
                     key = next(self._idx).to_bytes(length=511,
                                                    byteorder='big',
                                                    signed=False)
                     self.logger.debug(
                         "Queuing new task with SERIAL {sn}".format(
                             sn=int.from_bytes(
                                 key, byteorder='big', signed=False)))
                     txn.put(key, hash_key, append=True, db=self._queue_db)
                 else:
                     self.logger.debug("Updating already queued task")
         except lmdb.MapFullError:
             self.logger.critical(
                 "Database file {path} reached maximum size!".format(
                     path=self.path))
             raise Full()
Example #5
    def scanSubvolume(self, subvolume):
        hex = hashlib.sha1(subvolume).hexdigest()
        file = self.folder + "/" + hex + ".lz4"
        completed = self.folder + "/" + hex  + ".complete"

        print "scaning: " + subvolume + " (" + hex + ")"

        if os.path.exists(completed):
            return

        p = Popen('btrfs subvolume find-new ' + subvolume + ' 0', shell=True, stdout=PIPE, stderr=STDOUT)
        list = []

        for line in p.stdout.readlines():
            match = self.matchAllocation.match(line)

            if match is None:
                continue

            list.append(match.group(0))

        with open(file, "w") as text_file:
            text_file.write(lz4.compress("\n".join(list)))

        open(completed, 'a').close()
Example #6
    def open_db(self):
        self.terms_ldb = leveldb.LevelDB(self.terms_fl)
        self.docs_ldb = leveldb.LevelDB(self.docs_fl)

        self.doc_buffer_size = 0
        self.term_buffer_size = 0

        #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
        self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
        self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype="S%d" % self.max_term_size)
        self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype=np.int64)
        self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype=np.int64)

        if self.compression == COMPRESSION.NONE:
            self.compress = lambda string: string
            self.decompress = lambda string: string
        elif self.compression == COMPRESSION.ZLIB:
            import zlib
            self.compress = lambda string: zlib.compress(string, self.compression_level)
            self.decompress = lambda string: zlib.decompress(string)
        elif self.compression == COMPRESSION.LZMA:
            import backports.lzma as lzma
            self.compress = lambda string: lzma.compress(bytearray(string), format=lzma.FORMAT_RAW)
            self.decompress = lambda data: lzma.decompress(data, format=lzma.FORMAT_RAW)
        elif self.compression == COMPRESSION.LZ4R:
            import lz4
            self.compress = lambda string: lz4.compress(string)
            self.decompress = lambda string: lz4.decompress(string)
        elif self.compression == COMPRESSION.LZ4H:
            import lz4
            self.compress = lambda string: lz4.compressHC(string)
            self.decompress = lambda string: lz4.decompress(string)
        else:
            raise Exception("Wrong compression type %r" % self.compression)
Example #7
 def append(self, eventId, data):
     try:
         with self._sem:
             with self.lmdb.begin(write=True) as txn:
                 eventId = eventId.encode('utf-8')
                 data = compress(json.dumps(data))
                 mtime = int(time.time() * 1000).to_bytes(
                     length=6, byteorder=sys.byteorder, signed=False)
                 delta_idx = next(self.delta_idx).to_bytes(length=511,
                                                           byteorder='big',
                                                           signed=False)
                 index_db = self.lmdb.open_db(key=self.index_name,
                                              txn=txn,
                                              dupsort=True)
                 mtimes_db = self.lmdb.open_db(key=self.mtimes_name,
                                               txn=txn)
                 deltas_db = self.lmdb.open_db(key=self.deltas_name,
                                               txn=txn)
                 txn.put(delta_idx, data, append=True, db=deltas_db)
                 txn.put(eventId, delta_idx, dupdata=True, db=index_db)
                 txn.put(eventId, mtime, overwrite=True, db=mtimes_db)
     except lmdb.MapFullError:
         raise DeltaStoreFull(
             "Database file at {path} has reached its maximum size!".format(
                 path=self.db_path))
Example #8
def zip_compress(plain, level=9):
    if not USE_LZ4:
        compressed = zlib.compress(plain, level)
        return compressed[2:]
    else:
        compressed = lz4.compress(plain) if level < 9 else lz4.compressHC(plain)
        return compressed[4:]
Example #9
def compress(file):
    data = open(file, "r+b")
    if data.name.endswith('.tar'):
        compressed_data = open(data.name + ".lz4", "w")
        compressed_data.write(lz4.compress(data.read()))
    else:
        print "The file is not of the expected type."
Example #10
def do_test_rountrip_method(compress, i_data, c_data=None):
    from lz4 import LZ4_uncompress                      #@UnresolvedImport
    c = compress(i_data)
    if c_data is not None:
        assert c_data==c, "expected compressed data to look like %s, but got %s" % (hl(c_data), hl(c))
    d = LZ4_uncompress(c)
    assert d==i_data, "expected decompressed data to look like original %s, but got %s" % (hl(i_data), hl(d))
Example #11
    def run(self, cb_upload, cb_hash):
        if not self._block_size:
            raise Exception("Block size is not defined")

        block_cnt = 0
        block_new_cnt = 0

        bs = self._block_size * 1024 * 1024
        with open(self._device, "rb") as f:
            while True:
                #Read block and check for EOF
                block = f.read(bs)
                if not block:
                    self.log.info("Input EOF reached")
                    break
                block_cnt += 1

                #Create hash
                h = xxhash.xxh64(block).hexdigest()

                #Hash doesn't exist. Upload!
                if h not in self.blocks:
                    block_new_cnt += 1
                    if self._compression:
                        block = lz4.compress(block)
                    cb_upload(h, block)

                #Add hash to sync table
                cb_hash(h)

        self.log.info(
            "Dedup::run() completed. Uploaded {} new blocks, {} blocks in total"
            .format(block_new_cnt, block_cnt))
Example #12
def zip_compress(plain, level=9):
	if not USE_LZ4:
		compressed = zlib.compress(plain, level)
		return compressed[2:]
	else:		
		compressed = lz4.compress(plain) if level < 9 else lz4.compressHC(plain)
		return compressed[4:]
Example #13
    def send(self, message):
        version = (0 & 0xf) << 28
        msg_id = message.msg_id

        if not msg_id:
            msg_id = next(self.msg_ids)
            message.msg_id = msg_id

        msg_id = (msg_id & 0xfff) << 16
        msg_type = (message._MESSAGE_TYPE & 0xff) << 8

        compress = False
        compression = 0

        if self.compress:
            compress = True

        elif self.compress is False and message._MESSAGE_TYPE != RESPONSE:
            compress = True

        msg = message.pack()

        if compress and len(msg) >= COMPRESSION_THREASHOLD:
            compression = 1
            msg = lz4.compress(msg)

        header = version + msg_id + msg_type + compression
        data = _HEADER.pack(header, len(msg)) + msg
        self.sock.sendall(data)
        self.last_send = datetime.datetime.now()
Example #14
    def send(self, message):
        version = (0 & 0xf) << 28
        msg_id = message.msg_id

        if not msg_id:
            msg_id = next(self.msg_ids)
            message.msg_id = msg_id

        msg_id = (msg_id & 0xfff) << 16
        msg_type = (message._MESSAGE_TYPE & 0xff) << 8

        compress = False
        compression = 0

        if self.compress:
            compress = True

        elif self.compress is False and message._MESSAGE_TYPE != RESPONSE:
            compress = True

        msg = message.pack()

        if compress and len(msg) >= COMPRESSION_THREASHOLD:
            compression = 1
            msg = lz4.compress(msg)

        header = version + msg_id + msg_type + compression
        data = _HEADER.pack(header, len(msg)) + msg
        self.sock.sendall(data)
        self.last_send = datetime.datetime.now()
Example #15
    def _compress(
        self, data
    ):  # a compression function like lrzip in spirit: lz4>lzo>zlib>bz2>lzma
        if self.shuffle == True:
            try:
                print "shuffling..."
                data = buff_shuffle(
                    data)  # shuffling will work for a filter < 1GB
                print "data shuffled..."
            except:
                pass

        print "Compressing..."
        try:
            data = lz4.compress(data)  # will fail if filter > 1GB
            print "lz4 ok"
        except:
            pass
        try:
            data = lzo.compress(data)  # will fail if filter > 1GB
            print "lzo ok"
        except:
            pass
        #data = data.encode('zlib')
        #data = zlib.compress(data,1)
        #data = zlib.compress(data,9)
        #data = bz2.compress(data,9)
        data = zlib.compress(data)
        data = bz2.compress(data)
        data = lzma.compress(data)
        return data
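A hedged sketch of the reverse pipeline: the stages have to be undone in the opposite order. It assumes every stage in _compress actually ran and ignores the optional shuffle step; real code would need to record which stages were applied, since lz4/lzo are silently skipped on failure above:

import bz2
import zlib
import lzma
import lz4
import lzo

def _decompress(data):
    # Inverse of _compress(): lzma, bz2 and zlib always run, lz4/lzo only if they
    # succeeded during compression (assumed here).
    data = lzma.decompress(data)
    data = bz2.decompress(data)
    data = zlib.decompress(data)
    data = lzo.decompress(data)
    data = lz4.decompress(data)
    return data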
Example #16
def benchmark(data, hcdata=None):
    number = 100
    size = len(data)
    hcdata = hcdata or data

    for modname, func in [("pylz4", pylz4.compress), ("rustlz4", rustlz4.compress)]:
        timer = timeit.Timer(functools.partial(func, data))
        elapsed = timer.timeit(number=number)
        perf = size * number / elapsed / 1e6
        name = "%s.%s" % (modname, func.__name__)
        print("%24s: %8.2f MB/s" % (name, perf))

    for modname, func in [("pylz4", pylz4.compressHC), ("rustlz4", rustlz4.compresshc)]:
        timer = timeit.Timer(functools.partial(func, hcdata))
        elapsed = timer.timeit(number=number)
        perf = size * number / elapsed / 1e6
        name = "%s.%s" % (modname, func.__name__)
        print("%24s: %8.2f MB/s" % (name, perf))

    data = pylz4.compress(data)
    for modname, func in [("pylz4", pylz4.decompress), ("rustlz4", rustlz4.decompress)]:
        timer = timeit.Timer(functools.partial(func, data))
        elapsed = timer.timeit(number=number)
        perf = size * number / elapsed / 1e6
        name = "%s.%s" % (modname, func.__name__)
        print("%24s: %8.2f MB/s" % (name, perf))
Example #17
    def fset(self, inst, value):

        nprow = getattr(inst, 'NumpyArrayTable__' + self.name)
        #~ print 'fset',self.name,  nprow, value

        if nprow is None:
            nprow = self.NumpyArrayTableClass()
            setattr(inst, 'NumpyArrayTable__' + self.name, nprow)

        if value is None:
            if hasattr(inst, self.name + '_array'):
                delattr(inst, self.name + '_array')
            nprow.shape = None
            nprow.dtype = None
            nprow.blob = None
            nprow.units = None
            nprow.compress = None
            return

        if self.arraytype == np.ndarray:
            assert (type(value) == np.ndarray) or (
                type(value) == np.memmap
            ), 'Value is not np.array or np.memmap but {}'.format(type(value))
        if self.arraytype == pq.Quantity:
            assert type(
                value
            ) == pq.Quantity, '{} {} {} value is not pq.Quantity'.format(
                inst.__class__.__name__, self.name, value)

        shape = ('{},' * value.ndim)[:-1].format(*value.shape)
        if shape.endswith(','): shape = shape[:-1]
        nprow.shape = shape

        nprow.dtype = value.dtype.str

        if self.compress == 'blosc':
            blob = blosc.compress(value.tostring(),
                                  typesize=value.dtype.itemsize,
                                  clevel=9)
        else:
            if not value.flags['C_CONTIGUOUS']:
                buf = np.getbuffer(np.array(value, copy=True))
            else:
                buf = np.getbuffer(value)
            if self.compress == 'zlib':
                blob = zlib.compress(buf)
            elif self.compress == 'lz4':
                blob = lz4.compress(buf)
            elif self.compress == 'snappy':
                blob = snappy.compress(buf)
            else:
                blob = buf
        nprow.compress = self.compress
        nprow.blob = blob

        if self.arraytype == pq.Quantity:
            nprow.units = value.dimensionality.string

        setattr(inst, self.name + '_array', value)
Example #18
def chunks(inp):
	sr = dill.dumps(inp)
	inp = compress(sr)
	bs = 2**14
	out = []
	for x in range((len(inp) / bs) + 1):
		out.append(inp[(bs*x):(bs)*(x+1)])
	return out
Example #19
def compute_probabilities(replacement_idx, a, conditional_binary):
    """ 
    Compute the compressed length of a bit string with 
    number as bit a replacing the number as bit in 
    position replacement_idx
    """
    conditional_binary[replacement_idx] = a
    return len(lz4.compress(''.join(conditional_binary))) #54.65
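An illustrative call, comparing how well the string compresses with a '0' versus a '1' substituted at one position (the bit string and index below are made up for this example):

bits = list("0101010101010101")
# copies are passed because the function mutates its argument in place
len_with_zero = compute_probabilities(7, '0', list(bits))
len_with_one = compute_probabilities(7, '1', list(bits))
print(len_with_zero, len_with_one)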
Example #20
def roundtrip(size=None):
    if size is None:
        size = struct.unpack(">I", b"\0" + os.urandom(3))[0]
    data = os.urandom(size)
    assert rustlz4.decompress(pylz4.compress(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compress(data))) == data
    assert rustlz4.decompress(pylz4.compressHC(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compresshc(data))) == data
Example #21
 def process_document(self, session, doc):
     data = doc.get_raw(session)
     new = lz4.compress(data)
     return StringDocument(new,
                           self.id,
                           doc.processHistory,
                           parent=doc.parent,
                           filename=doc.filename)
Example #22
 def __init__(self, data_dir, obj_to_terms, obj_to_str, str_to_obj):
     self.data_dir = data_dir
     self.obj_to_terms = obj_to_terms
     self.obj_to_str = obj_to_str
     self.str_to_obj = str_to_obj
     self.id_term_map = None
     self.term_id_map = None
     self.objnum = 0
     try:
         import lz4 as compressor
         self.compress = compressor.compress
         self.compressHC = compressor.compressHC
         self.decompress = compressor.decompress
     except ImportError:
         import zlib as compressor
         self.compress = lambda data: compressor.compress(data, 3)
         self.compressHC = lambda data: compressor.compress(data, 9)
         self.decompress = lambda data: compressor.decompress(data)
Example #23
 def test_val_to_store_info_compress(self):
     """_val_to_store_info() should compress large values.
     """
     value = 'foo' * 32
     compressed_value = lz4.compress(value)
     client = memcache.Client('127.0.0.1', 11211, client_driver=NoopDriver)
     result = client._val_to_store_info(value, min_compress_len=1)
     self.assertEqual(result, (memcache.Client._FLAG_COMPRESSED,
                               compressed_value))
Example #24
def do_test_rountrip_method(compress, i_data, c_data=None):
    from lz4 import LZ4_uncompress  #@UnresolvedImport
    c = compress(i_data)
    if c_data is not None:
        assert c_data == c, "expected compressed data to look like %s, but got %s" % (
            hl(c_data), hl(c))
    d = LZ4_uncompress(c)
    assert d == i_data, "expected decompressed data to look like original %s, but got %s" % (
        hl(i_data), hl(d))
Example #25
 def __init__(self, data_dir, obj_to_terms, obj_to_str, str_to_obj):
     self.data_dir = data_dir
     self.obj_to_terms = obj_to_terms
     self.obj_to_str = obj_to_str
     self.str_to_obj = str_to_obj
     self.id_term_map = None
     self.term_id_map = None
     self.objnum = 0
     try:
         import lz4 as compressor
         self.compress = compressor.compress
         self.compressHC = compressor.compressHC
         self.decompress = compressor.decompress
     except ImportError:
         import zlib as compressor
         self.compress = lambda data: compressor.compress(data, 3)
         self.compressHC = lambda data: compressor.compress(data, 9)
         self.decompress = lambda data: compressor.decompress(data)
Example #26
def _dump2stor(store, bucketname, data, compress):
    if len(data) == 0:
        return ""
    key = j.data.hash.md5_string(data)
    if not key in objects or not key in new_objects:
        if compress:
            data = lz4.compress(data)
        store.set_object(bucketname, key, data)
        new_objects.append(key)
    return key
Example #27
def compress(scheme, data):
    hdr = data[:4] + struct.pack(">L", (scheme << 27) + (len(data) & 0x07ffffff))
    if scheme == 0 :
        return data
    elif scheme == 1 and lz4:
        res = lz4.compress(hdr + data)
        return res
    else:
        warnings.warn("Table failed to compress by unsupported compression scheme")
    return data
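For reference, a sketch of pulling the scheme and original length back out of the 8-byte hdr built above (the first 4 bytes of the data followed by a big-endian word holding the scheme in the top 5 bits and the length in the low 27 bits):

import struct

def parse_header(hdr):
    tag = hdr[:4]                            # original first 4 bytes of the data
    (word,) = struct.unpack(">L", hdr[4:8])
    scheme = word >> 27
    orig_len = word & 0x07ffffff
    return tag, scheme, orig_len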
Example #28
def _dump2stor(store, bucketname, data, compress):
        if len(data)==0:
            return ""       
        key = j.data.hash.md5_string(data)
        if not key in objects or not key in new_objects:
            if compress:
                data = lz4.compress(data)
            store.set_object(bucketname, key, data)
            new_objects.append(key)
        return key
Example #29
    def serialize_subarray(cls, subarray):
        if not subarray.flags['C_CONTIGUOUS']:
            subarray = subarray.copy(order='C')

        # Buffers larger than 1 GB would overflow
        # We could fix this by slicing each slice into smaller pieces...
        assert subarray.nbytes <= cls.MAX_LZ4_BUFFER_SIZE, \
            "FIXME: This class doesn't support compression of arrays whose slices are each > 1 GB"

        return lz4.compress(subarray)
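A hedged counterpart for rebuilding a subarray from such a blob, assuming the caller kept the dtype and shape (serialize_subarray itself does not store them):

import numpy as np
import lz4

def deserialize_subarray(blob, dtype, shape):
    # lz4.decompress returns the raw bytes; re-view them with the saved dtype/shape
    raw = lz4.decompress(blob)
    return np.frombuffer(raw, dtype=dtype).reshape(shape)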
Example #30
 def _put(self, item):
     key = next(self._idx).to_bytes(length=511,
                                    byteorder=sys.byteorder,
                                    signed=False)
     data = compress(pickle.dumps(
         item, protocol=4)) if self.compression else pickle.dumps(
             item, protocol=4)
     with self._sem:
         with self._lmdb.begin(write=True) as txn:
             txn.put(key, data, append=True, db=self._queue_db)
Example #31
 def fset(self, inst, value):
     
     nprow = getattr(inst, 'NumpyArrayTable__'+self.name)
     #~ print 'fset',self.name,  nprow, value
     
     if nprow is None:
         nprow = self.NumpyArrayTableClass()
         setattr(inst, 'NumpyArrayTable__'+self.name, nprow)
     
     if value is None:
         if hasattr(inst, self.name+'_array') :
             delattr(inst, self.name+'_array')
         nprow.shape = None
         nprow.dtype = None
         nprow.blob = None
         nprow.units = None
         nprow.compress = None
         return 
     
     if self.arraytype == np.ndarray:
         assert (type(value) == np.ndarray) or (type(value) == np.memmap) , 'Value is not np.array or np.memmap but {}'.format(type(value))
     if self.arraytype == pq.Quantity:
         assert type(value) == pq.Quantity , '{} {} {} value is not pq.Quantity'.format(inst.__class__.__name__, self.name, value)
     
     shape = ('{},'*value.ndim)[:-1].format(*value.shape)
     if shape.endswith(',') : shape = shape[:-1]
     nprow.shape = shape
     
     nprow.dtype = value.dtype.str
     
     if self.compress == 'blosc':
         blob = blosc.compress(value.tostring(), typesize = value.dtype.itemsize, clevel= 9)
     else:
         if not value.flags['C_CONTIGUOUS']:
             #~ buf = np.getbuffer(np.array(value, copy = True))
             buf = np.array(value, copy=True).data
         else:     
             #~ buf = np.getbuffer(value)
             buf = value.data
         if self.compress == 'zlib':
             blob = zlib.compress(buf)
         elif self.compress == 'lz4':
             blob = lz4.compress(buf)
         elif self.compress == 'snappy':
             blob = snappy.compress(buf)
         else :
             blob = buf
     nprow.compress = self.compress
     nprow.blob = blob
     
     if self.arraytype == pq.Quantity:
         nprow.units = value.dimensionality.string
     
     setattr(inst, self.name+'_array', value)
Example #32
    def test_value_to_store_info_compress_length(self):
        """ _val_to_store_info() should not use compressed values if too long.

        That is, if the compressed value is longer than the original value, use
        the original value instead.
        """
        value = '...'
        compressed_value = lz4.compress(value)
        self.assertGreater(len(compressed_value), len(value))
        client = memcache.Client('127.0.0.1', 11211, client_driver=NoopDriver)
        result = client._val_to_store_info(value, min_compress_len=1)
        self.assertEqual(result, (0, value))
Example #33
        def write_stripe(f, compressed, header, padding=True):
            h = compress(marshal.dumps(header))
            assert len(h) < STRIPE_HEADER_SIZE
            f.write(struct.pack('I', len(h)))
            f.write(h)
            padding_size = STRIPE_SIZE - len(h) - 4
            for c in compressed:
                f.write(c)
                padding_size -= len(c)

            if padding:
                f.write('\0' * padding_size)
Example #34
        def write_stripe(f, compressed, header, padding=True):
            h = compress(marshal.dumps(header))
            assert len(h) < STRIPE_HEADER_SIZE
            f.write(struct.pack('I', len(h)))
            f.write(h)
            padding_size = STRIPE_SIZE - len(h) - 4
            for c in compressed:
                f.write(c)
                padding_size -= len(c)

            if padding:
                f.write('\0' * padding_size)
Example #35
    def _compress_as_lz4(self):
        if self._lz4_items is None:
            self._uncompress() # Ensure not currently compressed as draco
            compressed = []
            
            flat_vertices = self._vertices_zyx.reshape(-1)
            compressed.append( lz4.compress(flat_vertices) ) #@UndefinedVariable
            self._vertices_zyx = None
            
            flat_normals = self._normals_zyx.reshape(-1)
            compressed.append( lz4.compress(flat_normals) ) #@UndefinedVariable
            self._normals_zyx = None
    
            flat_faces = self._faces.reshape(-1)
            compressed.append( lz4.compress(flat_faces) ) #@UndefinedVariable
            self._faces = None

            # Compress twice: still fast, even smaller
            self._lz4_items = list(map(lz4.compress, compressed)) #@UndefinedVariable
        
        return sum(map(len, self._lz4_items))
Example #36
    def send(self, data, host, port):
        """Send raw data through a socket"""

        if not isinstance(data, dict) and not isinstance(data, list):
            raise TypeError("data must be either a list or a dictionary")

        # Getting raw data
        if self._use_lz4:
            data_raw = lz4.compress(self._codec.encode(data))
        else:
            data_raw = self._codec.encode(data)

        # Put the data on the wire (a UTF-8 JSON string, LZ4-compressed when enabled)
        self.sock.sendto(data_raw, (host, port))
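A receive-side sketch under two assumptions not stated in the code above: the receiver knows out-of-band whether LZ4 was used (the sender does not flag it on the wire), and codec.decode is the inverse of the codec.encode used by send:

import lz4

def recv(sock, codec, use_lz4, bufsize=65535):
    data_raw, addr = sock.recvfrom(bufsize)
    if use_lz4:
        data_raw = lz4.decompress(data_raw)
    return codec.decode(data_raw), addr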
Example #37
    def __init__(self, numpy_array):
        """Serializes and compresses the numpy array with LZ4"""

        self.raw_buffer = None  # only used if we can't compress
        self.compressed_label_blocks = None  # only used for label arrays of suitable shape
        self.compressed_mask_array = None  # only used for binary masks

        self.serialized_subarrays = []
        if numpy_array.flags['F_CONTIGUOUS']:
            self.layout = 'F'
        else:
            self.layout = 'C'

        if self.layout == 'F':
            numpy_array = numpy_array.transpose()

        self.dtype = numpy_array.dtype
        self.shape = numpy_array.shape

        # TODO: Also support compression of bool arrays via the special DVID binary compression
        if self.is_labels(numpy_array):
            self.compressed_label_blocks = serialize_uint64_blocks(numpy_array)
        elif self.dtype == np.bool and numpy_array.ndim == 3:
            # It turns out that encode_mask_array + lz4.compress is better than
            # lz4 compression alone (even multiple rounds of lz4)
            self.compressed_mask_array = lz4.compress(
                encode_mask_array(numpy_array))
        else:

            if numpy_array.ndim <= 1:
                slice_bytes = numpy_array.nbytes
            else:
                slice_bytes = numpy_array[0].nbytes

            if slice_bytes > CompressedNumpyArray.MAX_LZ4_BUFFER_SIZE:
                warnings.warn(
                    "Array is too large to compress -- not compressing.")
                if not numpy_array.flags['C_CONTIGUOUS']:
                    numpy_array = numpy_array.copy(order='C')
                self.raw_buffer = bytearray(numpy_array)
            else:
                # For 2D, 1D or 0D arrays, serialize everything in one buffer.
                if numpy_array.ndim <= 2:
                    self.serialized_subarrays.append(
                        self.serialize_subarray(numpy_array))
                else:
                    # For ND arrays, serialize each slice independently, to ease RAM usage
                    for subarray in numpy_array:
                        self.serialized_subarrays.append(
                            self.serialize_subarray(subarray))
Example #38
 def __init__(self, data_dir):
     # term = str()
     # triple = str()
     # args(triple) = (int)
     self.data_dir = data_dir
     # table: id(term) -> term
     self.term_id_map = None
     # table: id(triple) -> args(triple)
     self.triple_id_map = None
     # table: id(term) -> args(triple)
     self.arg_cache = None
     self.rel_id_map = REL_NAME_ID_MAP
     self.id_rel_map = REL_ID_NAME_MAP
     try:
         import lz4 as compressor
         self.compress = compressor.compress
         self.compressHC = compressor.compressHC
         self.decompress = compressor.decompress
     except ImportError:
         import zlib as compressor
         self.compress = lambda data: compressor.compress(data, 3)
         self.compressHC = lambda data: compressor.compress(data, 9)
         self.decompress = lambda data: compressor.decompress(data)
Example #39
 def __init__(self, data_dir):
     # term = str()
     # triple = str()
     # args(triple) = (int)
     self.data_dir = data_dir
     # table: id(term) -> term
     self.term_id_map = None
     # table: id(triple) -> args(triple)
     self.triple_id_map = None
     # table: id(term) -> args(triple)
     self.arg_cache = None
     self.rel_id_map = REL_NAME_ID_MAP
     self.id_rel_map = REL_ID_NAME_MAP
     try:
         import lz4 as compressor
         self.compress = compressor.compress
         self.compressHC = compressor.compressHC
         self.decompress = compressor.decompress
     except ImportError:
         import zlib as compressor
         self.compress = lambda data: compressor.compress(data, 3)
         self.compressHC = lambda data: compressor.compress(data, 9)
         self.decompress = lambda data: compressor.decompress(data)
Example #40
def save_book_raw_data(book_raw_data_obj):

    if use_compression:
        f = open(
            join("/Volumes/NewVolume/Emotional-Arcs/database/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p.lz4"), "wb")
        f.write(
            lz4.compress(
                pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL)))
        f.close()
    else:
        f = open(
            join("/Volumes/NewVolume/Emotional-Arcs/database/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p"), "wb")
        f.write(pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL))
        f.close()
Example #41
def save_book_raw_data(book_raw_data_obj):

    if use_compression:
        f = open(
            join("/Users/andyreagan/projects/2014/09-books/data/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p.lz4"), "wb")
        f.write(
            lz4.compress(
                pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL)))
        f.close()
    else:
        f = open(
            join("/Users/andyreagan/projects/2014/09-books/data/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p"), "wb")
        f.write(pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL))
        f.close()
Example #42
    def push_to_s3(self, compress=True):
        """Upload object to Amazon s3 bucket"""

        s3 = self.connect_and_check_bucket()

        # Archive
        logging.info('Archiving {}...'.format(self.name))
        tar_path = os.path.join(self.PATH, '{}.tar'.format(self.name))
        lz4_path = os.path.join(self.PATH, '{}.tar.lz4'.format(self.name))

        with tarfile.open(tar_path, 'w') as in_file:
            for filename in glob.glob(
                    os.path.join(self.PATH,
                                 '{stem}.*'.format(stem=self.name))):
                in_file.add(filename, arcname=os.path.split(filename)[1])

        # Compress
        if compress:
            logging.info('{} LZ4 compressing...'.format(self.name))
            with open(tar_path, 'rb') as in_file:
                with open(lz4_path, 'wb') as out_file:
                    out_file.write(lz4.compress(in_file.read()))
            upload_path = lz4_path
        else:
            upload_path = tar_path

        # Get key
        key = self.get_key(compress=compress)

        # Upload
        logging.info('Uploading {} on s3...'.format(self.name))
        s3.meta.client.upload_file(upload_path,
                                   self.BUCKET_NAME,
                                   key,
                                   Callback=self.ProgressPercentage(
                                       float(os.path.getsize(upload_path))))

        # Clean
        os.remove(tar_path)
        if compress:
            os.remove(lz4_path)
        logging.info('Upload {} DONE'.format(self.name))
Example #43
def compress(data, clibs=[]):
    if len(clibs) == 0:
        return (False, None, data)
    for c in clibs:
        try:
            if c == "zlib":
                import zlib
                return (True, "zlib", zlib.compress(data))
            elif c == "lz4":
                import lz4
                return (True, "zlib", lz4.compress(data))
            elif c == "snappy":
                import pysnappy
                return (True, "snappy", pysnappy.compress(data))
            else:
                import zlib
                return (True, "zlib", zlib.compress(data))
        except:
            return (False, None, data)
    return (False, None, data)
Example #44
    def open_db(self):
        self.terms_ldb = leveldb.LevelDB(self.terms_fl)
        self.docs_ldb = leveldb.LevelDB(self.docs_fl)

        self.doc_buffer_size = 0
        self.term_buffer_size = 0

        #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
        self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
        self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size,
                                          dtype="S%d" % self.max_term_size)
        self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size,
                                            dtype=np.int64)
        self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size,
                                             dtype=np.int64)

        if self.compression == COMPRESSION.NONE:
            self.compress = lambda string: string
            self.decompress = lambda string: string
        elif self.compression == COMPRESSION.ZLIB:
            import zlib
            self.compress = lambda string: zlib.compress(
                string, self.compression_level)
            self.decompress = lambda string: zlib.decompress(string)
        elif self.compression == COMPRESSION.LZMA:
            import backports.lzma as lzma
            self.compress = lambda string: lzma.compress(
                bytearray(string), format=lzma.FORMAT_RAW)
            self.decompress = lambda data: lzma.decompress(
                data, format=lzma.FORMAT_RAW)
        elif self.compression == COMPRESSION.LZ4R:
            import lz4
            self.compress = lambda string: lz4.compress(string)
            self.decompress = lambda string: lz4.decompress(string)
        elif self.compression == COMPRESSION.LZ4H:
            import lz4
            self.compress = lambda string: lz4.compressHC(string)
            self.decompress = lambda string: lz4.decompress(string)
        else:
            raise Exception("Wrong compression type %r" % self.compression)
Example #45
    def compress_fixture(self, fixture_lz4_path, with_header):
        """
        Compress json fixture with lz4

        fixture_lz4_path:
        The path to the compressed fixture

        with_header:
        Specify if you want to have the mozilla header
        at the start of the compressed file
        """
        fixture_json_path = 'fixtures/fixture.json'

        with open(fixture_json_path, mode='rb') as json_file:
            data = json_file.read()
            compressed_data = lz4.compress(data)

            with open(fixture_lz4_path, mode='wb') as lz4_file:
                if with_header:
                    header = b'mozLz40\0'
                    lz4_file.write(header)
                lz4_file.write(compressed_data)
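A sketch of reading such a fixture back: strip the 8-byte b'mozLz40\0' magic when it is present, then hand the rest to the legacy lz4.decompress (the helper name below is invented for this sketch):

import lz4

MOZLZ4_MAGIC = b'mozLz40\0'

def read_fixture(path):
    with open(path, mode='rb') as lz4_file:
        payload = lz4_file.read()
    if payload.startswith(MOZLZ4_MAGIC):
        payload = payload[len(MOZLZ4_MAGIC):]
    return lz4.decompress(payload)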
Example #46
    def _val_to_store_info(self, val, min_compress_len):
        """
        Transform val to a storable representation, returning a tuple of the
        flags, the length of the new value, and the new value itself.
        """
        flags = 0
        if isinstance(val, str):
            pass
        elif isinstance(val, int):
            flags |= Client._FLAG_INTEGER
            val = "%d" % val
            # maxint is pretty tiny. just return
            return (flags, val)
        elif isinstance(val, long):
            flags |= Client._FLAG_LONG
            val = "%d" % val
            # longs can be huge, so check length and compress if long enough
        else:
            if self.pickle:
                flags |= Client._FLAG_PICKLE
                val = pickle.dumps(val, self.pickle_proto)

        lv = len(val)
        #  do not store if value length exceeds maximum
        if self.max_value_length and lv > self.max_value_length:
            raise MemcacheValueError(
                "Value is larger than configured max_value_length. %d > %d" %
                (lv, self.max_value_length))

        # We should try to compress if min_compress_len > 0 and this
        # string is longer than min threshold.
        if min_compress_len and lv > min_compress_len:
            comp_val = lz4.compress(val)
            # Only actually compress if the compressed result is smaller
            # than the original.
            if len(comp_val) < lv:
                flags |= Client._FLAG_COMPRESSED
                val = comp_val
        return (flags, val)
Example #47
    def compress_fixture(self, fixture_lz4_path, with_header):
        """
        Compress json fixture with lz4

        fixture_lz4_path:
        The path to the compressed fixture

        with_header:
        Specify if you want to have the mozilla header
        at the start of the compressed file
        """
        fixture_json_path = 'fixtures/fixture.json'

        with open(fixture_json_path, mode='rb') as json_file:
            data = json_file.read()
            compressed_data = lz4.compress(data)

            with open(fixture_lz4_path, mode='wb') as lz4_file:
                if with_header:
                    header = b'mozLz40\0'
                    lz4_file.write(header)
                lz4_file.write(compressed_data)
Example #48
def run():
    fileDir = '/root/thermite/target/release/thermite_test_Neil'
    ratioFile_path = sys.argv[1]
    ratioList = []

    openData = open(fileDir, "rb")
    ratioFile = open(ratioFile_path, "w")

    vvfd = os.open(fileDir, os.O_RDONLY)
    vvData = os.lseek(vvfd, 0, os.SEEK_END)

    upper_limit = vvData / 4096

    for x in xrange(0, upper_limit, 1):
        openData.seek(4096 * x, 0)
        data = openData.read(4096)
        compOutput = lz4.compress(data)
        sys.getsizeof(compOutput)
        ratio = sys.getsizeof(compOutput) / float(sys.getsizeof(data))
        ratioFile.write(str(ratio) + '\n')

    ratioFile.close()
Example #49
def pack(fname):

    global total, compressed


    f = open(fname)
    data = f.read()
    f.close()
    checksum = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
    fsize = len(data)
    data = lz4.compress(data)

    # size of packet(4), checksum(8), fnamelen(2)+fname, uncompressed size(4), compressed data
    l = len(data)
    pktlen = 4 + 8 + 2 + len(fname) + 4 + l

    total += fsize
    compressed += len(data)

    sys.stderr.write("%s: %d -> %d\n"  %(fname, fsize, len(data)))
    sys.stdout.write( struct.pack('<IQH%dsI%ds' % (len(fname), l), pktlen, checksum, len(fname), fname,fsize,data))
    sys.stdout.flush()
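A hedged reader for the packet layout documented in the comment inside pack() above; stream is any binary file-like object, and the function name is invented for this sketch:

import struct
import lz4

def unpack_one(stream):
    fixed = stream.read(14)                      # pktlen(4) + checksum(8) + fnamelen(2)
    if len(fixed) < 14:
        return None                              # EOF
    pktlen, checksum, fnamelen = struct.unpack('<IQH', fixed)
    fname = stream.read(fnamelen)
    (fsize,) = struct.unpack('<I', stream.read(4))
    payload = stream.read(pktlen - 14 - fnamelen - 4)
    data = lz4.decompress(payload)
    assert len(data) == fsize
    return fname, checksum, data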
Example #50

    for line in bz2_fh:
        line = line.decode('utf-8')
        title_match = title_re.match(line)
        if title_match:
            if 'lat_d' in old_coords and 'long_d' in old_coords:
                lat, lng = normalize_coords(**old_coords)
                j += 1
            if lat and lng:
                abstract, img = extract_abstract(text)
                rank = len(text)
                #print("\t".join(map(str, (title, lat, lng))))
                print(title)
                try:
                    POI(name=title, at=[lng, lat], abstract=lz4.compress(abstract), rank=rank, img=img).save()
                    #POI(name=title, at=[lng, lat], abstract=abstract).save()
                except Exception as e:
                    print("Insert error:", str(e), title, lat, lng, file=sys.stderr)
#                    raise
#                print("Begin abstract")
#                print(abstract)
#                print("End abstract")
            title = title_match.group(1)
            coords, old_coords, lat, lng, text, in_text = None, {}, None, None, '', False
            continue
        coord_match = coord_re.match(line)
        if coord_match:
            try:
                lat, lng = coord2latlng(line)
#                print("\t".join(map(str, (title, lat, lng))))
Example #51
import uuid
import timeit
import lz4
import snappy
import os
from timeit import Timer

DATA = open("../src/lz4.c", "rb").read()
LZ4_DATA = lz4.compress(DATA)
SNAPPY_DATA = snappy.compress(DATA)
LOOPS = 200000

print("Data Size:")
print("  Input: %d" % len(DATA))
print("  LZ4: %d (%.2f)" % (len(LZ4_DATA), len(LZ4_DATA) / float(len(DATA))))
print("  Snappy: %d (%.2f)" % (len(SNAPPY_DATA), len(SNAPPY_DATA) / float(len(DATA))))
print("  LZ4 / Snappy: %f" % (float(len(LZ4_DATA)) / float(len(SNAPPY_DATA))))

print("Benchmark: %d calls" % LOOPS)
print("  LZ4 Compression: %fs" % Timer("lz4.compress(DATA)", "from __main__ import DATA; import lz4").timeit(number=LOOPS))
print("  Snappy Compression: %fs" % Timer("snappy.compress(DATA)", "from __main__ import DATA; import snappy").timeit(number=LOOPS))
print("  LZ4 Decompression: %fs" % Timer("lz4.uncompress(LZ4_DATA)", "from __main__ import LZ4_DATA; import lz4").timeit(number=LOOPS))
print("  Snappy Decompression : %fs" % Timer("snappy.uncompress(SNAPPY_DATA)", "from __main__ import SNAPPY_DATA; import snappy").timeit(number=LOOPS))
Example #52
 def process_document(self, session, doc):
     data = doc.get_raw(session)
     new = lz4.compress(data)
     return StringDocument(new, self.id, doc.processHistory,
                           parent=doc.parent, filename=doc.filename)
Example #53
def test_roundtripLZ4():
    _str = "hello world"
    cstr = lz4.compress(_str)
    assert _str == c.decompress(cstr)
Example #54
def lz4_pickle_dump(data, filename):
    path = pathlib.Path(filename)
    with path.open('wb') as f:
        f.write(lz4.compress(pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)))
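A likely counterpart for reading the dump back, sketched with the same legacy lz4 API (the real project may name or structure this differently):

import pathlib
import pickle
import lz4

def lz4_pickle_load(filename):
    path = pathlib.Path(filename)
    with path.open('rb') as f:
        return pickle.loads(lz4.decompress(f.read()))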
Example #55
    def _test_block(labels, test_name):
        
        # labelarray
        with Timer() as timer:
            dvid_encoded_list = serialize_uint64_blocks(labels)
        dvid_encoded_bytes = sum(map(len, dvid_encoded_list))
        dvid_enc_time = timer.seconds
        dvid_enc_throughput = (labels.nbytes / dvid_enc_time) / 1e6
    
        with Timer() as timer:
            decoded = deserialize_uint64_blocks(dvid_encoded_list, labels.shape)
        assert (decoded == labels).all()
        dvid_dec_time = timer.seconds
        dvid_dec_throughput = (labels.nbytes / dvid_dec_time) / 1e6

        # DVID + gzip
        with Timer() as timer:
            gzipped_dvid_encoded_list = list(map(gzip.compress, dvid_encoded_list))
        gzipped_dvid_enc_time = timer.seconds + dvid_enc_time
        gzipped_dvid_enc_throughput = (labels.nbytes / gzipped_dvid_enc_time) / 1e6
        
        gzipped_dvid_encoded_bytes = sum(map(len, gzipped_dvid_encoded_list))
        print("+ GZIP:", gzipped_dvid_encoded_bytes)
        print(f"Compression ratio: {labels.nbytes/gzipped_dvid_encoded_bytes:.1f}x")
        print(f"DVID+GZIP encode throughput: {gzipped_dvid_enc_throughput} MB/s")
 
        with Timer() as timer:
            unzippped = list(map(gzip.decompress, gzipped_dvid_encoded_list))
        assert (decoded == labels).all()
        gzipped_dvid_dec_time = timer.seconds + dvid_dec_time
        gzipped_dvid_dec_throughput = (labels.nbytes / gzipped_dvid_dec_time) / 1e6
        print(f"DVID+GZIP decode throughput: {gzipped_dvid_dec_throughput} MB/s")

        # DVID + LZ4
        with Timer() as timer:
            lz4_dvid_encoded_list = list(map(lz4.compress, dvid_encoded_list))
        lz4_dvid_enc_time = timer.seconds + dvid_enc_time
        lz4_dvid_enc_throughput = (labels.nbytes / lz4_dvid_enc_time) / 1e6
        
        lz4_dvid_encoded_bytes = sum(map(len, lz4_dvid_encoded_list))
        print("+ LZ4:", lz4_dvid_encoded_bytes)
        print(f"Compression ratio: {labels.nbytes/lz4_dvid_encoded_bytes:.1f}x")
        print(f"DVID+LZ4 encode throughput: {lz4_dvid_enc_throughput} MB/s")
 
        with Timer() as timer:
            unzippped = list(map(lz4.decompress, lz4_dvid_encoded_list))
        assert (decoded == labels).all()
        lz4_dvid_dec_time = timer.seconds + dvid_dec_time
        lz4_dvid_dec_throughput = (labels.nbytes / lz4_dvid_dec_time) / 1e6
        print(f"DVID+LZ4 decode throughput: {lz4_dvid_dec_throughput} MB/s")

        # lz4
        with Timer() as timer:
            lz4_encoded = lz4.compress(labels)
        lz4_encoded_bytes = len(lz4_encoded)
        lz4_enc_time = timer.seconds
        lz4_enc_throughput = (labels.nbytes / lz4_enc_time) / 1e6
    
        with Timer() as timer:
            lz4_decoded = lz4.decompress(lz4_encoded)
        decoded_labels = np.frombuffer(lz4_decoded, np.uint64).reshape(labels.shape)
        assert (decoded_labels == labels).all()
        lz4_dec_time = timer.seconds
        lz4_dec_throughput = (labels.nbytes / lz4_dec_time) / 1e6

        
        global HEADER_PRINTED
        if not HEADER_PRINTED:
            print(f"{'':>20s} {'______ ENCODED BYTES ______ ':^41s}  | {'______ ENCODING TIME ______ ':^77s} | {'______ DECODING TIME ______ ':^77s} |")
            print(f"{'':>20s} {'LZ4':>10s} {'DVID':>10s} {'D+G':>10s} {'DECREASE':>9s} |"
                  f"{'------- LZ4 -------':>22s} {'------ DVID ------':>22s} {'---- DVID+GZIP ----':>22s} {'SLOWDOWN':>9s} |"
                  f"{'------- LZ4 -------':>22s} {'------ DVID ------':>22s} {'---- DVID+GZIP ----':>22s} {'SLOWDOWN':>9s} |")
            HEADER_PRINTED = True

        print(f"{test_name:>19s}: {lz4_encoded_bytes: 10d} {dvid_encoded_bytes: 10d} {lz4_encoded_bytes/dvid_encoded_bytes:8.1f}x |"
              f"{lz4_enc_time:6.2f}s ({lz4_enc_throughput:7.1f} MB/s) {dvid_enc_time:6.2f}s ({dvid_enc_throughput:7.1f} MB/s) {dvid_enc_time/lz4_enc_time:8.1f}x |"
              f"{lz4_dec_time:6.2f}s ({lz4_dec_throughput:7.1f} MB/s) {dvid_dec_time:6.2f}s ({dvid_dec_throughput:7.1f} MB/s) {dvid_dec_time/lz4_dec_time:8.1f}x |")
Example #56
def test_lz4_4_2():
    lz4.compress(garbage100k)
Example #57
 def lz4_compress(byts):
     # write length in big-endian instead of little-endian
     return int32_pack(len(byts)) + lz4.compress(byts)[4:]
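The matching decompression has to restore the little-endian length prefix that the legacy lz4 module expects, since lz4_compress rewrote it in big-endian; a hedged sketch:

import struct
import lz4

def lz4_decompress(byts):
    (length,) = struct.unpack('>i', byts[:4])    # big-endian length written by lz4_compress
    return lz4.decompress(struct.pack('<i', length) + byts[4:])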
Example #58
from mokujin.resource import StopList
from mokujin.misc import transliterate_ru
from mokujin.resource import ConceptNetList
from mokujin.index import TripleSearchEngine
from mokujin.sourcesearch import TripleStoreExplorer
from mokujin.patternsearch import PatternCollection

try:
    import lz4 as comp
    comp_format = "lz4"
    compress = comp.compressHC
    decompress = comp.decompress
except ImportError:
    import zlib as comp
    comp_format = "zip"
    compress = lambda string: comp.compress(string, 9)
    decompress = comp.decompress

if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--index", default="data/index", help="Triple store index directory", type=str)
    parser.add_argument("-o", "--outputdir", default="output",
                        help="Directory where script's ouput will be placed",  type=str)
    parser.add_argument("-q", "--queryterm", default=None, help="Query term", type=str)
    parser.add_argument("-qf", "--queryterms_file", default=None, help="File with query terms", type=str)
    parser.add_argument("-s", "--stoplist", default="resources/word.freq.ru.csv", help="Stop list file", type=str)
    parser.add_argument("-ts", "--t_stop", default=500, help="Stop words frequency threshold", type=float)
    parser.add_argument("-tt", "--t_triple", default=5, help="Min frequency treshold for target triples", type=float)