def snappy_pack_blob(obj, sep=SEP): if obj is None: return "" c = obj.dtype.char if c == "S": return "S" + snappy.compress(sep.join(obj)) return buffer(c + snappy.compress(obj.tobytes()))
def snappy_pack_blob(obj, sep=SEP):
    if obj is None:
        return ''
    c = obj.dtype.char
    if c == 'S' or c == 'U':
        return b'U' + snappy.compress(sep.join(obj))
    return buffer(c.encode('utf-8') + snappy.compress(obj.tobytes(), 'utf8'))
def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32 * 1024):
    """Encodes the given data with snappy compression.

    If xerial_compatible is set then the stream is encoded in a fashion
    compatible with the xerial snappy library.

    The block size (xerial_blocksize) controls how frequent the blocking
    occurs; 32k is the default in the xerial library.

    The format winds up being:

        +-------------+------------+--------------+------------+--------------+
        |   Header    | Block1 len | Block1 data  | Blockn len | Blockn data  |
        +-------------+------------+--------------+------------+--------------+
        |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
        +-------------+------------+--------------+------------+--------------+

    It is important to note that the blocksize is the amount of uncompressed
    data presented to snappy at each block, whereas the blocklen is the number
    of bytes that will be present in the stream; so the length will always be
    <= blocksize.
    """
    if not has_snappy():
        raise NotImplementedError("Snappy codec is not available")

    if xerial_compatible:
        def _chunker():
            for i in xrange(0, len(payload), xerial_blocksize):
                yield payload[i:i + xerial_blocksize]

        out = BytesIO()

        header = b''.join([
            struct.pack('!' + fmt, dat)
            for fmt, dat in zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER)
        ])

        out.write(header)
        for chunk in _chunker():
            block = snappy.compress(chunk)
            block_size = len(block)
            out.write(struct.pack('!i', block_size))
            out.write(block)

        out.seek(0)

        return out.read()
    else:
        return snappy.compress(payload)
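# Note: the _XERIAL_V1_FORMAT and _XERIAL_V1_HEADER names used above (and in
# the later snappy_encode / encode_snappy variants) are not defined in these
# snippets. A plausible definition, shown here only as a reference sketch, is
# the one kafka-python uses for the 16-byte header described in the docstring:
_XERIAL_V1_HEADER = (-126, b'S', b'N', b'A', b'P', b'P', b'Y', 0, 1, 1)
_XERIAL_V1_FORMAT = 'bccccccBii'  # 1 + 6 + 1 + 4 + 4 = 16 bytes when packed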
def test_view_snappy_compressed():
    if not snappy_installed():
        raise SkipTest
    import snappy

    cluster = pseudo_hdfs4.shared_cluster()
    finish = []
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir('/tmp/test-snappy-filebrowser'):
            cluster.fs.rmtree('/tmp/test-snappy-filebrowser')
        cluster.fs.mkdir('/tmp/test-snappy-filebrowser/')

        f = cluster.fs.open('/tmp/test-snappy-filebrowser/test-view.snappy', "w")
        f.write(snappy.compress('This is a test of the emergency broadcasting system.'))
        f.close()

        f = cluster.fs.open('/tmp/test-snappy-filebrowser/test-view.stillsnappy', "w")
        f.write(snappy.compress('The broadcasters of your area in voluntary cooperation with the FCC and other authorities.'))
        f.close()

        f = cluster.fs.open('/tmp/test-snappy-filebrowser/test-view.notsnappy', "w")
        f.write('foobar')
        f.close()

        # Snappy compressed fail
        response = c.get('/filebrowser/view/tmp/test-snappy-filebrowser/test-view.notsnappy?compression=snappy')
        assert_true('Failed to decompress' in response.context['message'], response)

        # Snappy compressed succeed
        response = c.get('/filebrowser/view/tmp/test-snappy-filebrowser/test-view.snappy')
        assert_equal('snappy', response.context['view']['compression'])
        assert_equal(response.context['view']['contents'], 'This is a test of the emergency broadcasting system.', response)

        # Snappy compressed succeed
        response = c.get('/filebrowser/view/tmp/test-snappy-filebrowser/test-view.stillsnappy')
        assert_equal('snappy', response.context['view']['compression'])
        assert_equal(response.context['view']['contents'], 'The broadcasters of your area in voluntary cooperation with the FCC and other authorities.', response)

        # Largest snappy compressed file
        finish.append(MAX_SNAPPY_DECOMPRESSION_SIZE.set_for_testing(1))
        response = c.get('/filebrowser/view/tmp/test-snappy-filebrowser/test-view.stillsnappy?compression=snappy')
        assert_true('File size is greater than allowed max snappy decompression size of 1' in response.context['message'], response)
    finally:
        for done in finish:
            done()
        try:
            cluster.fs.rmtree('/tmp/test-snappy-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
def transmit(result, sock):
    pickler = pickle.Pickler(sock)
    cols = list(result.keys())
    pickler.dump(cols)
    for col in cols:
        if result[col].dtype == object:
            colz = snappy.compress(pickle.dumps(result[col]))
        else:
            colz = snappy.compress(result[col])
        pickler.dump(result[col].dtype)
        pickler.dump(colz)
def record_question(self, post: dict):
    # update set of users_ids (users with activity)
    for user_id in post.get("users_ids"):
        self._all_users_ids.add(user_id)

    # add this postId to the ordered list of questions sorted by score
    self.pipe.zadd(self.questions_key(), mapping={post["Id"]: post["Score"]}, nx=True)

    # add this question's PostId to the ordered questions sets of all its tags
    for tag in post.get("Tags", []):
        self.pipe.zadd(self.tag_key(tag), mapping={post["Id"]: post["Score"]}, nx=True)

    # store int for user Ids (most use) to save some space in redis.
    # names stored as str thus belong to deleted users; this prevents deleted
    # users with a name such as "3200" from being considered User#3200
    if post.get("OwnerUserId"):
        post["OwnerName"] = int(post["OwnerUserId"])

    # store question details
    self.pipe.setnx(
        self.question_key(post["Id"]),
        snappy.compress(
            json.dumps((
                post["CreationTimestamp"],
                post["OwnerName"],
                post["has_accepted"],
                post["nb_answers"],
                # Tag ID can be None in the event a Tag existed and was not used
                # but got used first during the dumping process, after the Tags
                # were dumped but before questions were fully dumped.
                # SO Tag `imac` in 2021-06 dumps for instance
                [self.get_tag_id(tag) for tag in post.get("Tags", []) if self.get_tag_id(tag)],
            ))),
    )

    # record question's meta: ID: title, excerpt for use in home and tag pages
    self.pipe.set(
        self.question_details_key(post["Id"]),
        snappy.compress(json.dumps((post["Title"], get_text(post["Body"], strip_at=250)))),
    )

    self.bump_seen(4 + len(post.get("Tags", [])))
    self.commit_maybe()
def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32*1024):
    """Encodes the given data with snappy compression.

    If xerial_compatible is set then the stream is encoded in a fashion
    compatible with the xerial snappy library.

    The block size (xerial_blocksize) controls how frequent the blocking
    occurs; 32k is the default in the xerial library.

    The format winds up being:

        +-------------+------------+--------------+------------+--------------+
        |   Header    | Block1 len | Block1 data  | Blockn len | Blockn data  |
        +-------------+------------+--------------+------------+--------------+
        |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
        +-------------+------------+--------------+------------+--------------+

    It is important to note that the blocksize is the amount of uncompressed
    data presented to snappy at each block, whereas the blocklen is the number
    of bytes that will be present in the stream; so the length will always be
    <= blocksize.
    """
    if not has_snappy():
        raise NotImplementedError("Snappy codec is not available")

    if xerial_compatible:
        def _chunker():
            for i in xrange(0, len(payload), xerial_blocksize):
                yield payload[i:i+xerial_blocksize]

        out = BytesIO()

        header = b''.join([struct.pack('!' + fmt, dat)
                           for fmt, dat in zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER)])

        out.write(header)
        for chunk in _chunker():
            block = snappy.compress(chunk)
            block_size = len(block)
            out.write(struct.pack('!i', block_size))
            out.write(block)

        out.seek(0)

        return out.read()
    else:
        return snappy.compress(payload)
def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32 * 1024): """ Compress the given data with the Snappy algorithm. :param bytes payload: Data to compress. :param bool xerial_compatible: If set then the stream is broken into length-prefixed blocks in a fashion compatible with the xerial snappy library. The format winds up being:: +-------------+------------+--------------+------------+--------------+ | Header | Block1_len | Block1 data | BlockN len | BlockN data | |-------------+------------+--------------+------------+--------------| | 16 bytes | BE int32 | snappy bytes | BE int32 | snappy bytes | +-------------+------------+--------------+------------+--------------+ :param int xerial_blocksize: Number of bytes per chunk to independently Snappy encode. 32k is the default in the xerial library. :returns: Compressed bytes. :rtype: :class:`bytes` """ if not has_snappy( ): # FIXME This should be static, not checked every call. raise NotImplementedError("Snappy codec is not available") if xerial_compatible: def _chunker(): for i in range(0, len(payload), xerial_blocksize): yield payload[i:i + xerial_blocksize] out = BytesIO() out.write(_XERIAL_HEADER) for chunk in _chunker(): block = snappy.compress(chunk) out.write(struct.pack('!i', len(block))) out.write(block) out.seek(0) return out.read() else: return snappy.compress(payload)
def encode_snappy(buff, xerial_compatible=False, xerial_blocksize=32 * 1024):
    """Encode a buffer using snappy

    If xerial_compatible is set, the buffer is encoded in a fashion compatible
    with the xerial snappy library.

    The block size (xerial_blocksize) controls how frequently the blocking
    occurs. 32k is the default in the xerial library.

    The format is as follows:

        +-------------+------------+--------------+------------+--------------+
        |   Header    | Block1 len | Block1 data  | Blockn len | Blockn data  |
        |-------------+------------+--------------+------------+--------------|
        |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
        +-------------+------------+--------------+------------+--------------+

    It is important to note that `blocksize` is the amount of uncompressed
    data presented to snappy at each block, whereas `blocklen` is the number
    of bytes that will be present in the stream.

    Adapted from kafka-python
    https://github.com/mumrah/kafka-python/pull/127/files
    """
    # snappy segfaults if it gets a read-only buffer on PyPy
    if IS_PYPY or PY3:
        buff = bytes(buff)

    if snappy is None:
        raise ImportError("Please install python-snappy")

    if xerial_compatible:
        def _chunker():
            for i in range(0, len(buff), xerial_blocksize):
                yield buff[i:i + xerial_blocksize]

        out = BytesIO()
        full_data = list(zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER))
        header = b''.join(
            [struct.pack('!' + fmt, dat) for fmt, dat in full_data])

        out.write(header)
        for chunk in _chunker():
            block = snappy.compress(chunk)
            block_size = len(block)
            out.write(struct.pack('!i', block_size))
            out.write(block)

        out.seek(0)

        return out.read()
    else:
        return snappy.compress(buff)
def update_post(username, slug):
    user = current_user
    content = request.form.get('content', type=str)
    cursor = request.form.get('cursor', type=int)

    if content is not None:
        post = user.posts.filter_by(slug=slug).first()
        if post:
            post.cursor = len(content) if not cursor else cursor
            post.modified_timestamp = datetime.utcnow()

            # Get meta
            r = regex.compile(r'<<((?:(?>[^<>]+)|<(?!<)|>(?!>))*?)>>', regex.I | regex.S)
            post.meta = json.dumps(regex.findall(r, content))

            # Encrypt
            half_key = session[generate_hash(user.user_key_salt)]
            key = xor_keys(half_key, app.config['MASTER_KEY'])
            content = snappy.compress(content)
            content = AES_encrypt(key, user.username, content)
            post.content = content

            db.session.add(post)
            db.session.commit()
            return jsonify(error=None)
        return jsonify(error="Not found")
    elif cursor is not None:
        post = user.posts.filter_by(slug=slug).first()
        if post:
            post.cursor = cursor
            db.session.add(post)
            db.session.commit()
            return jsonify(error=None)
        return jsonify(error="Not found")
    return jsonify(error="Invalid parameters")
def write_key(ds, kind, id, data_path):
    key = ds.key(kind, id)
    entity = datastore.Entity(key=key, exclude_from_indexes=['Value'])
    with open(data_path) as f:
        data = json.load(f)
        payload = {
            'LastModified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'SchemaVersion': '',
            'DataType': data['dataType'],
            'Season': data['season']['name'],
            'Sport': data['sport'] if 'sport' in data else '',
            'League': data['league']['alias'],
            'TeamId': str(data['team']['id']),
            'PlayerId': data['player']['id'] if 'player' in data else '',
            'EventId': data['eventId'] if 'eventId' in data else '',
            'EventDate': data['eventDate'] if 'eventDate' in data else '',
            'EventType': data['eventType'] if 'eventType' in data else '',
            'Value': snappy.compress(msgpack.packb(data))
        }
        print payload
        entity.update(payload)
        ds.put(entity)
def read(self, _):
    # Stop copying when the bevy is full.
    if self.chunk_count_in_bevy >= self.owner.chunks_per_segment:
        return ""

    chunk = self.stream.read(self.owner.chunk_size)
    if not chunk:
        return ""

    self.size += len(chunk)

    if self.owner.compression == lexicon.AFF4_IMAGE_COMPRESSION_ZLIB:
        compressed_chunk = zlib.compress(chunk)
    elif (snappy and self.owner.compression ==
          lexicon.AFF4_IMAGE_COMPRESSION_SNAPPY):
        compressed_chunk = snappy.compress(chunk)
    elif self.owner.compression == lexicon.AFF4_IMAGE_COMPRESSION_STORED:
        compressed_chunk = chunk

    compressedLen = len(compressed_chunk)
    self.chunk_count_in_bevy += 1

    if compressedLen < self.owner.chunk_size - 16:
        self.bevy_index.append((self.bevy_length, compressedLen))
        self.bevy_length += compressedLen
        return compressed_chunk
    else:
        self.bevy_index.append((self.bevy_length, self.owner.chunk_size))
        self.bevy_length += self.owner.chunk_size
        return chunk
def FlushChunk(self, chunk):
    if len(chunk) == 0:
        return

    bevy_offset = self.bevy_length

    if self.compression == lexicon.AFF4_IMAGE_COMPRESSION_ZLIB:
        compressed_chunk = zlib.compress(chunk)
    elif (snappy and self.compression ==
          lexicon.AFF4_IMAGE_COMPRESSION_SNAPPY):
        compressed_chunk = snappy.compress(chunk)
    elif self.compression == lexicon.AFF4_IMAGE_COMPRESSION_STORED:
        compressed_chunk = chunk

    compressedLen = len(compressed_chunk)

    if compressedLen < self.chunk_size - 16:
        self.bevy_index.append((bevy_offset, compressedLen))
        self.bevy.append(compressed_chunk)
        self.bevy_length += compressedLen
    else:
        self.bevy_index.append((bevy_offset, self.chunk_size))
        self.bevy.append(chunk)
        self.bevy_length += self.chunk_size

    #self.bevy_index.append((bevy_offset, len(compressed_chunk)))
    #self.bevy.append(compressed_chunk)
    #self.bevy_length += len(compressed_chunk)
    self.chunk_count_in_bevy += 1

    #self.buffer = chunk[self.chunk_size:]
    if self.chunk_count_in_bevy >= self.chunks_per_segment:
        self._FlushBevy()
def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.

    print('-------------zlib---------------')
    message = 'hello'
    compressed = zlib.compress(message.encode())
    decompressed = zlib.decompress(compressed)

    print('original:', repr(message))
    print_hex(compressed)
    print('decompressed:', repr(decompressed))

    # with open('1.txt', 'w', encoding='utf-8') as f:  # python3
    #     f.write(compressed.decode(encoding="utf-8"))

    print('-------------gzip---------------')
    f_in = open("1.txt", "rb")               # open the input file
    f_out = gzip.open("data.txt.gz", "wb")   # create the compressed file object
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()

    print('-------------snappy---------------')
    compressed = snappy.compress(message)
    print_hex(compressed)
    decompressed = snappy.uncompress(compressed)
    print('uncompressed:', repr(decompressed))
def _compress_event(self, event: BaseEvent) -> Union[BaseEvent, bytes]:
    if self.has_snappy_support:
        import snappy

        return cast(bytes, snappy.compress(pickle.dumps(event)))
    else:
        return event
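# The matching decode path is not part of the snippet above. A minimal sketch,
# assuming the same has_snappy_support flag and that compressed events arrive
# as snappy-compressed pickles (hypothetical helper, not from the source):
def _decompress_event(self, data: Union[BaseEvent, bytes]) -> BaseEvent:
    if self.has_snappy_support and isinstance(data, bytes):
        import snappy

        # Reverse of _compress_event: snappy.decompress, then unpickle.
        return cast(BaseEvent, pickle.loads(snappy.decompress(data)))
    return data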
def write_header(self):
    if not self.fp:
        self.fp = open(self.filename, 'w+b')
    fp = self.fp

    # header layout:
    # 5I: major, minor, part0_size, n_properties, n_sections
    # 64s * n_sections
    # packed properties
    n_sections = len(self.section_names)
    part0_size = 20 + 64 * n_sections
    props_size = self._measure_properties_pack_size(self.properties)
    header_len = part0_size + props_size

    header_buf = bytearray(header_len)

    n_properties = len(self.properties)
    struct.pack_into('5I', header_buf, 0,
                     self.major, self.minor, part0_size, n_properties, n_sections)

    for n in range(0, n_sections):
        off = 20 + n * 64
        struct.pack_into('64s', header_buf, off, self.section_names[n].encode('utf-8'))

    self._pack_properties(self.properties, header_buf, part0_size)

    fp.seek(0)

    # XXX: it's really pretty surprising that Python has no zero-copy way of
    # casting a bytearray to bytes or a readonly memoryview and you can't
    # fake it by subclassing the builtin memoryview class. We have to pass
    # in a read-only bytes like object here which means we have to incur a
    # redundant copy :(
    compressed = snappy.compress(bytes(header_buf))
    compressed_len = len(compressed)

    if compressed_len > self.frames_offset:
        # If this is a new file then there's no inherent limit to the
        # size of the header, but otherwise we can't grow beyond the
        # space allocated when the file was first written:
        if self._is_empty:
            while compressed_len > self.frames_offset:
                self.frames_offset *= 2
        else:
            raise Exception("header too long")

    fp.write(struct.pack('4sII', 'P4cK'.encode('ascii'), self.frames_offset, compressed_len))
    fp.write(compressed)

    if fp.tell() < self.frames_offset:
        fp.seek(self.frames_offset - 1)
        fp.write(b' ')
def compress(self):
    if self._compressed is not None:
        return self._compressed
    if self._decompressed is None:
        return None
    self._compressed = snappy.compress(self._decompressed)
    return self._compressed
def add_image(self, ts, img):
    """ Add depth image to archive """
    # write time stamp to file
    self._fp_ts.write('%f\n' % ts)
    self._fp_ts.flush()

    # add frame to binary file
    #if img.dtype == np.float32:
        # Is simulated data
        #mask = np.isnan(img)
        #if mask.any():
            # In simulation, the background has NaN depth values.
            # We replace them with 0 m, similar to what the Kinect V1 did.
            # See https://msdn.microsoft.com/en-us/library/jj131028.aspx.
            #rospy.logdebug("There was at least one NaN in the depth image. " +
            #               "I replaced all occurrences with 0.0 m.")
            #img.flags.writeable = True
            #img[mask] = 0.0

        # We now map the float values in meters to uint16 values in mm
        # as provided by the libfreenect2 library and Kinect SDK.
        #img *= 1000.0
        #img = img.astype(np.uint16, copy=False)

    #assert img.dtype == np.uint16

    # compress image with snappy
    img_comp = snappy.compress(img)

    # write number of bytes of compressed image
    nr_bytes = struct.pack('<L', len(img_comp))
    self._fp_d.write(nr_bytes)

    # write compressed image
    self._fp_d.write(img_comp)
    self._fp_d.flush()
def process_raw(self, evidence_uuid, pipeline, data, raw, autosave=True, wait_for_result=False):
    """Process the raw data using the given pipeline.

    @param evidence_uuid: Evidence id
    @param pipeline: Name of pipeline to use
    @param data: Current information about the stream
    @param raw: Raw data to process
    @param autosave: Auto passes result onto the engine
    @param wait_for_result: Wait to get the resulting data
    @returns: result data
    """
    raw = base64.b64encode(snappy.compress(raw))

    if wait_for_result:
        return self._client.call({}, 'process_raw',
                                 evidence_uuid=evidence_uuid,
                                 pipeline=pipeline,
                                 data=data,
                                 raw=raw,
                                 return_result=wait_for_result,
                                 autosave=autosave)

    self._client.cast({}, 'process_raw',
                      evidence_uuid=evidence_uuid,
                      pipeline=pipeline,
                      data=data,
                      raw=raw,
                      return_result=wait_for_result,
                      autosave=autosave)
def snappy_write_block(encoder, block_bytes):
    """Write block in "snappy" codec."""
    data = snappy.compress(block_bytes)
    encoder.write_long(len(data) + 4)  # for CRC
    encoder._fo.write(data)
    encoder.write_crc32(block_bytes)
def compress(data): """ Compresses given data via the snappy algorithm. The result is preceded with a header containing the string 'SNAPPY' and the default and min-compat versions (both ``1``). The block size for the compression is hard-coded at 32kb. If ``python-snappy`` is not installed a ``RuntimeError`` is raised. """ if not snappy_available: raise RuntimeError("Snappy compression unavailable.") buff = BytesIO() buff.write(raw_header) for block_num in range(0, len(data), BLOCK_SIZE): block = data[block_num:block_num + BLOCK_SIZE] compressed = snappy.compress(block) buff.write(struct.pack("!i", len(compressed))) buff.write(compressed) result = buff.getvalue() buff.close() return result
def _pack_msgpack_snappy(obj):
    # print "pack", obj
    tmp = msgpack.dumps(obj, encoding='utf-8')
    if len(tmp) > 1000:
        return b'S' + snappy.compress(tmp)
    else:
        return b'\0' + tmp
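# The unpacking counterpart is not shown above. A minimal sketch, assuming the
# one-byte prefix convention used by _pack_msgpack_snappy (b'S' marks a
# snappy-compressed msgpack payload, b'\0' a plain one); hypothetical helper:
def _unpack_msgpack_snappy(buf):
    if buf[:1] == b'S':
        return msgpack.loads(snappy.decompress(buf[1:]), encoding='utf-8')
    return msgpack.loads(buf[1:], encoding='utf-8')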
def compress(data: bytes):
    """
    Compresses data with default encoding UTF-8.

    We currently use Google's Snappy (it's fast).

    :param data: bytes that are going to be compressed.
    :return: bytes compressed by algorithm.
    """
    return snappy.compress(data, 'utf-8')
def encode(self, serializer: AbstractSerializer, compress: bool = False) \
        -> RawHeaderBody:
    metadata = b''
    if self.metadata is not None:
        metadata = self.metadata.encode()
    header = {
        'type': int(self.msgtype),
        'meth': self.method,
        'okey': self.order_key,
        'seq': self.client_seq_id,
        'zip': compress,
    }
    serialized_header: bytes = mpackb(header)
    body: Optional[bytes]
    if self.msgtype in (RPCMessageTypes.FUNCTION, RPCMessageTypes.RESULT):
        body = serializer(self.body)
    else:
        body = self.body
    data = {
        'meta': metadata,
        'body': body,
    }
    serialized_data: bytes = mpackb(data)
    if compress:
        if not has_snappy:
            raise ConfigurationError('python-snappy is not installed')
        serialized_data = snappy.compress(serialized_data)
    return RawHeaderBody(
        serialized_header,
        serialized_data,
        self.peer_id,
    )
def _pack(obj):
    # print "PACK", obj
    tmp = msgpack.dumps(obj)
    if len(tmp) > 1000:
        return 'S' + snappy.compress(tmp)
    else:
        return '\0' + tmp
def new_fn(self, arg, *args, **kw):
    try:
        date = arg.name
    except AttributeError:
        date = arg

    if validate_date:
        assert DATE_RE.match(date), date

    fnam = hashlib.md5((source_id + date).encode('utf-8')).hexdigest()
    cache_path = get_cache_dir() / source_id / f'{fnam}.json'

    if not cache_path.parent.exists():
        cache_path.parent.mkdir()

    if cache_path.exists():
        with open(cache_path, 'rb') as f:
            datapoints = json.loads(snappy.decompress(f.read()))
        return [_DataPoint(*i) for i in datapoints]
    else:
        # WARNING: args/kw aren't taken into account here!!
        datapoints = fn(self, date, *args, **kw)

        json_data = json.dumps([tuple(i) for i in datapoints]).encode('utf-8')
        json_data = snappy.compress(json_data)

        with open(cache_path, 'wb') as f:
            f.write(json_data)
        return datapoints
def snappy_write_block(fo, block_bytes): """Write block in "snappy" codec.""" data = snappy.compress(block_bytes) write_long(fo, len(data) + 4) # for CRC fo.write(data) write_crc32(fo, block_bytes)
def snappy_write_block(fo, block_bytes):
    '''Write block in "snappy" codec.'''
    data = snappy.compress(block_bytes)
    write_long(fo, len(data) + 4)  # for CRC
    fo.write(data)
    write_crc32(fo, block_bytes)
def run(self):
    while True:
        self.__RunningFlag = False
        try:
            PartitionID_ = self.__PendingTaskQueue.get(timeout=0.05)
        except:
            if self.__stop.is_set():
                break
            else:
                continue
        self.__RunningFlag = True

        UpdatedVertex = self.__ControlInfo['CalcFunc'](
            PartitionID_, self.__DataInfo, self.__GraphInfo, self.__Dtype_All)

        start_id = PartitionID_ * self.__GraphInfo['VertexPerPartition']
        end_id = (PartitionID_ + 1) * self.__GraphInfo['VertexPerPartition']

        UpdatedVertex = UpdatedVertex - self.__DataInfo['VertexData'][start_id:end_id]
        UpdatedVertex[np.where(
            abs(UpdatedVertex) <= self.__ControlInfo['FilterThreshold'])] = 0
        UpdatedVertex = UpdatedVertex.astype(self.__Dtype_All['VertexData'])

        Tmp_UpdatedData = np.append(UpdatedVertex, PartitionID_)
        Tmp_UpdatedData = Tmp_UpdatedData.astype(self.__Dtype_All['VertexData'])

        Str_UpdatedData = Tmp_UpdatedData.tostring()
        Str_UpdatedData = snappy.compress(Str_UpdatedData)
        QueueUpdatedVertex.put(Str_UpdatedData)
def test_compression(self):
    # test that we can add compressed chunks
    compressor = snappy.StreamCompressor()
    data = b"\0" * 50
    compressed_data = snappy.compress(data)
    crc = struct.pack("<L", snappy._masked_crc32c(data))
    self.assertEqual(crc, b"\x8f)H\xbd")
    self.assertEqual(len(compressed_data), 6)
    self.assertEqual(compressor.add_chunk(data, compress=True),
                     b"\xff\x06\x00\x00sNaPpY"
                     b"\x00\x0a\x00\x00" + crc + compressed_data)

    # test that we can add uncompressed chunks
    data = b"\x01" * 50
    crc = struct.pack("<L", snappy._masked_crc32c(data))
    self.assertEqual(crc, b"\xb2\x14)\x8a")
    self.assertEqual(compressor.add_chunk(data, compress=False),
                     b"\x01\x36\x00\x00" + crc + data)

    # test that we can add more data than will fit in one chunk
    data = b"\x01" * (snappy._CHUNK_MAX * 2 - 5)
    crc1 = struct.pack("<L", snappy._masked_crc32c(data[:snappy._CHUNK_MAX]))
    self.assertEqual(crc1, b"h#6\x8e")
    crc2 = struct.pack("<L", snappy._masked_crc32c(data[snappy._CHUNK_MAX:]))
    self.assertEqual(crc2, b"q\x8foE")
    self.assertEqual(compressor.add_chunk(data, compress=False),
                     b"\x01\x04\x00\x01" + crc1 + data[:snappy._CHUNK_MAX] +
                     b"\x01\xff\xff\x00" + crc2 + data[snappy._CHUNK_MAX:])
def encode_inform(config, data):
    iv = Random.new().read(16)

    key = MASTER_KEY
    if config.getboolean('gateway', 'is_adopted'):
        key = config.get('gateway', 'key')

    payload = None
    flags = 3
    if 'snappy' in sys.modules:
        payload = snappy.compress(data)
        flags = 5
    else:
        payload = zlib.compress(data)

    pad_len = AES.block_size - (len(payload) % AES.block_size)
    payload += chr(pad_len) * pad_len
    payload = AES.new(a2b_hex(key), AES.MODE_CBC, iv).encrypt(payload)

    mac = config.get('gateway', 'lan_mac')

    encoded_data = 'TNBU'                               # magic
    encoded_data += pack('>I', 1)                       # packet version
    encoded_data += pack('BBBBBB', *(mac_string_2_array(mac)))
    encoded_data += pack('>H', flags)                   # flags
    encoded_data += iv                                  # encryption iv
    encoded_data += pack('>I', 1)                       # payload version
    encoded_data += pack('>I', len(payload))            # payload length
    encoded_data += payload

    return encoded_data
def _encode(self, data):
    try:
        import snappy
    except ImportError:
        _print_import_error()
        raise
    return snappy.compress(data)
def save(self, data):
    compressed_data = snappy.compress(data)
    encrypted_data = self.encrypter.encrypt_data(compressed_data)
    questionnaire_state = QuestionnaireState(
        self._user_id, encrypted_data, QuestionnaireStore.LATEST_VERSION)
    current_app.eq["storage"].put(questionnaire_state)
def compress(compression_scheme, compression_level, data, compressor_context):
    if compression_scheme == 0:  # zlib
        return zlib.compress(data, compression_level)
    elif compression_scheme == 1:  # zstd
        return compressor_context.compress(data)
    elif compression_scheme == 2:  # lz4
        return lz4.frame.compress(data, compression_level=compression_level, store_size=False)
    elif compression_scheme == 3:  # snappy
        return snappy.compress(data)
    elif compression_scheme == 4:  # bzip
        return bz2.compress(data, compresslevel=compression_level)
    elif compression_scheme == 5:  # lzma
        return lzma.compress(data, preset=compression_level)
    elif compression_scheme == 6:  # blosc_zlib
        return blosc.compress(data, clevel=compression_level, cname='zlib', shuffle=blosc.BITSHUFFLE)
    elif compression_scheme == 7:  # blosc_zstd
        return blosc.compress(data, clevel=compression_level, cname='zstd', shuffle=blosc.BITSHUFFLE)
    elif compression_scheme == 8:  # blosc_lz4
        return blosc.compress(data, clevel=compression_level, cname='lz4', shuffle=blosc.BITSHUFFLE)
    elif compression_scheme == 9:  # blosc_snappy
        return blosc.compress(data, clevel=compression_level, cname='snappy', shuffle=blosc.BITSHUFFLE)
    elif compression_scheme == 10:  # blosclz
        return blosc.compress(data, clevel=compression_level, cname='blosclz', shuffle=blosc.BITSHUFFLE)
    elif compression_scheme == 11:  # blosc_lz4hc
        return blosc.compress(data, clevel=compression_level, cname='lz4hc', shuffle=blosc.BITSHUFFLE)
    else:
        raise NotImplementedError('compression scheme not implemented')
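# Brief usage sketch for the dispatcher above (not from the original source).
# Scheme 3 selects snappy, which ignores the level and context arguments, so
# placeholder values can be passed for them.
payload = b"example payload" * 1024
blob = compress(3, 0, payload, None)
assert snappy.decompress(blob) == payload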
def rewrite(data_string):
    data = json.loads(data_string)
    toupdate = json.loads(update)
    #primary_key_modified = False

    #delete the appropriate document
    query = BooleanQuery()
    for key in primary_keys_map:
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    #modify the values
    for key, value in toupdate.items():
        #if such a key is not present then we either add and update that key into data, or just ignore it! (By default it is set to True!)
        if add_field_if_not_exists == False:
            if key in data.keys():
                data[key] = value
        else:
            data[key] = value

    #this deletion statement has been intentionally added here:
    #only if the modified data has primary keys that do not already exist will the updating process continue
    primary_key_update = False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update = True
            break
    if primary_key_update == True:
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    writer.deleteDocuments(query)

    #add the newly modified document
    doc = Document()
    #index fields wrt primary key
    for primary_key in primary_keys_map:
        try:
            field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101

    #compress data using snappy if compression is on
    if to_be_compressed_input == True:
        temp = json.dumps(data)
        data_string = base64.b64encode(snappy.compress(temp))
    else:
        temp = json.dumps(data)
        data_string = base64.b64encode(temp)

    field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32 * 1024):
    """
    Compress the given data with the Snappy algorithm.

    :param bytes payload: Data to compress.
    :param bool xerial_compatible:
        If set then the stream is broken into length-prefixed blocks in a
        fashion compatible with the xerial snappy library.

        The format winds up being::

            +-------------+------------+--------------+------------+--------------+
            |   Header    | Block1_len | Block1 data  | BlockN len | BlockN data  |
            |-------------+------------+--------------+------------+--------------|
            |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
            +-------------+------------+--------------+------------+--------------+

    :param int xerial_blocksize:
        Number of bytes per chunk to independently Snappy encode. 32k is the
        default in the xerial library.

    :returns: Compressed bytes.
    :rtype: :class:`bytes`
    """
    if not has_snappy():  # FIXME This should be static, not checked every call.
        raise NotImplementedError("Snappy codec is not available")

    if xerial_compatible:
        def _chunker():
            for i in range(0, len(payload), xerial_blocksize):
                yield payload[i:i+xerial_blocksize]

        out = BytesIO()
        out.write(_XERIAL_HEADER)

        for chunk in _chunker():
            block = snappy.compress(chunk)
            out.write(struct.pack('!i', len(block)))
            out.write(block)

        out.seek(0)
        return out.read()
    else:
        return snappy.compress(payload)
def compress(data):
    meta = {
        "compression": "snappy",
        "orig_size": len(data)
    }
    compressed_data = snappy.compress(data)
    return meta, compressed_data
def test_view_snappy_compressed(self):
    if not snappy_installed():
        raise SkipTest
    import snappy

    cluster = pseudo_hdfs4.shared_cluster()
    finish = []
    try:
        c = make_logged_in_client()
        prefix = self.cluster.fs_prefix + '/test_view_snappy_compressed'
        self.cluster.fs.mkdir(prefix)

        f = cluster.fs.open(prefix + '/test-view.snappy', "w")
        f.write(snappy.compress('This is a test of the emergency broadcasting system.'))
        f.close()

        f = cluster.fs.open(prefix + '/test-view.stillsnappy', "w")
        f.write(snappy.compress('The broadcasters of your area in voluntary cooperation with the FCC and other authorities.'))
        f.close()

        f = cluster.fs.open(prefix + '/test-view.notsnappy', "w")
        f.write('foobar')
        f.close()

        # Snappy compressed fail
        response = c.get('/filebrowser/view=%s/test-view.notsnappy?compression=snappy' % prefix)
        assert_true('Failed to decompress' in response.context['message'], response)

        # Snappy compressed succeed
        response = c.get('/filebrowser/view=%s/test-view.snappy' % prefix)
        assert_equal('snappy', response.context['view']['compression'])
        assert_equal(response.context['view']['contents'], 'This is a test of the emergency broadcasting system.', response)

        # Snappy compressed succeed
        response = c.get('/filebrowser/view=%s/test-view.stillsnappy' % prefix)
        assert_equal('snappy', response.context['view']['compression'])
        assert_equal(response.context['view']['contents'], 'The broadcasters of your area in voluntary cooperation with the FCC and other authorities.', response)

        # Largest snappy compressed file
        finish.append(MAX_SNAPPY_DECOMPRESSION_SIZE.set_for_testing(1))
        response = c.get('/filebrowser/view=%s/test-view.stillsnappy?compression=snappy' % prefix)
        assert_true('File size is greater than allowed max snappy decompression size of 1' in response.context['message'], response)
    finally:
        for done in finish:
            done()
def pack(data):
    if SNAPPY_ENABLED:
        data = snappy.compress(
            pyarrow.serialize(data).to_buffer().to_pybytes())
        # TODO(ekl) we shouldn't need to base64 encode this data, but this
        # seems to not survive a transfer through the object store if we don't.
        return base64.b64encode(data)
    else:
        return data
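# The inverse is not included above. A minimal sketch, assuming the same
# SNAPPY_ENABLED flag and pyarrow serialization used by pack() (hypothetical):
def unpack(data):
    if SNAPPY_ENABLED:
        # Reverse the pyarrow -> snappy -> base64 pipeline from pack().
        return pyarrow.deserialize(snappy.decompress(base64.b64decode(data)))
    return data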
def snappy_write_block(stream, block_bytes):
    """Write a block of bytes with the 'snappy' codec."""
    data = snappy.compress(block_bytes)

    # Add 4 bytes for the CRC32
    write_long(stream, len(data) + 4)
    stream.write(data)

    # Write the 4-byte, big-endian CRC32 checksum
    crc = crc32(block_bytes) & 0xFFFFFFFF
    stream.write(pack('>I', crc))
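# A matching reader is not part of these snippets. A minimal sketch, assuming
# the layout written above (length-prefixed snappy data followed by a
# big-endian CRC32 of the uncompressed bytes), plus a hypothetical read_long()
# helper mirroring write_long() and struct's unpack() alongside pack():
def snappy_read_block(stream):
    length = read_long(stream)
    block_bytes = snappy.decompress(stream.read(length - 4))
    (expected_crc,) = unpack('>I', stream.read(4))
    if crc32(block_bytes) & 0xFFFFFFFF != expected_crc:
        raise ValueError("CRC32 mismatch in snappy-compressed block")
    return block_bytes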
def transform(self, buf):
    for trans_id in self.__write_transforms:
        if trans_id == self.ZLIB_TRANSFORM:
            buf = zlib.compress(buf)
        elif trans_id == self.SNAPPY_TRANSFORM:
            buf = snappy.compress(buf)
        else:
            raise TTransportException(TTransportException.INVALID_TRANSFORM,
                                      "Unknown transform during send")
    return buf
def _get(key, callback, args):
    r = cache_table.find_one({'_id': key})
    if not r:
        content = callback(*args)
        data = bson.binary.Binary(snappy.compress(content))
        cache_table.insert_one({'_id': key, 'data': data})
    else:
        data = r['data']
        content = snappy.decompress(data)
    return content
def enqueue(self, obj):
    data = pickle.dumps(obj)
    if getattr(self, "_debug_pickle", False):
        import objgraph
        restored = pickle.loads(data)
        objgraph.show_refs(restored, too_many=40)
    data = snappy.compress(data)
    self.debug("Broadcasting %d bytes" % len(data))
    zmq_connection = getattr(self, "zmq_connection")
    if zmq_connection is not None:
        zmq_connection.send(data)
def __compress(self, event):
    original_event = event
    event['data'] = snappy.compress(event['data'])
    event['header']['snappy'] = True
    self.logging.debug("Incoming data compressed.")
    try:
        self.queuepool.outbox.put(event)
    except QueueLocked:
        self.queuepool.inbox.rescue(original_event)
        self.queuepool.outbox.waitUntilPutAllowed()
def fset(self, inst, value):
    nprow = getattr(inst, 'NumpyArrayTable__' + self.name)
    #~ print 'fset', self.name, nprow, value

    if nprow is None:
        nprow = self.NumpyArrayTableClass()
        setattr(inst, 'NumpyArrayTable__' + self.name, nprow)

    if value is None:
        if hasattr(inst, self.name + '_array'):
            delattr(inst, self.name + '_array')
        nprow.shape = None
        nprow.dtype = None
        nprow.blob = None
        nprow.units = None
        nprow.compress = None
        return

    if self.arraytype == np.ndarray:
        assert (type(value) == np.ndarray) or (type(value) == np.memmap), \
            'Value is not np.array or np.memmap but {}'.format(type(value))
    if self.arraytype == pq.Quantity:
        assert type(value) == pq.Quantity, \
            '{} {} {} value is not pq.Quantity'.format(inst.__class__.__name__, self.name, value)

    shape = ('{},' * value.ndim)[:-1].format(*value.shape)
    if shape.endswith(','):
        shape = shape[:-1]
    nprow.shape = shape

    nprow.dtype = value.dtype.str

    if self.compress == 'blosc':
        blob = blosc.compress(value.tostring(), typesize=value.dtype.itemsize, clevel=9)
    else:
        if not value.flags['C_CONTIGUOUS']:
            #~ buf = np.getbuffer(np.array(value, copy=True))
            buf = np.array(value, copy=True).data
        else:
            #~ buf = np.getbuffer(value)
            buf = value.data
        if self.compress == 'zlib':
            blob = zlib.compress(buf)
        elif self.compress == 'lz4':
            blob = lz4.compress(buf)
        elif self.compress == 'snappy':
            blob = snappy.compress(buf)
        else:
            blob = buf
    nprow.compress = self.compress
    nprow.blob = blob

    if self.arraytype == pq.Quantity:
        nprow.units = value.dimensionality.string

    setattr(inst, self.name + '_array', value)
def test_decompression(self):
    # test that we check for the initial stream identifier
    data = b"\x01" * 50
    self.assertRaises(snappy.UncompressError,
                      snappy.StreamDecompressor().decompress,
                      b"\x01\x36\x00\00" +
                      struct.pack("<L", snappy._masked_crc32c(data)) + data)
    self.assertEqual(
        snappy.StreamDecompressor().decompress(
            b"\xff\x06\x00\x00sNaPpY"
            b"\x01\x36\x00\x00" +
            struct.pack("<L", snappy._masked_crc32c(data)) + data),
        data)
    decompressor = snappy.StreamDecompressor()
    decompressor.decompress(b"\xff\x06\x00\x00sNaPpY")
    self.assertEqual(
        decompressor.copy().decompress(
            b"\x01\x36\x00\x00" +
            struct.pack("<L", snappy._masked_crc32c(data)) + data),
        data)

    # test that we throw errors for unknown unskippable chunks
    self.assertRaises(snappy.UncompressError,
                      decompressor.copy().decompress, b"\x03\x01\x00\x00")

    # test that we skip unknown skippable chunks
    self.assertEqual(b"",
                     decompressor.copy().decompress(b"\xfe\x01\x00\x00"))

    # test that we check CRCs
    compressed_data = snappy.compress(data)
    real_crc = struct.pack("<L", snappy._masked_crc32c(data))
    fake_crc = os.urandom(4)
    self.assertRaises(snappy.UncompressError,
                      decompressor.copy().decompress,
                      b"\x00\x0a\x00\x00" + fake_crc + compressed_data)
    self.assertEqual(
        decompressor.copy().decompress(
            b"\x00\x0a\x00\x00" + real_crc + compressed_data),
        data)

    # test that we buffer when we don't have enough
    uncompressed_data = os.urandom(100)
    compressor = snappy.StreamCompressor()
    compressed_data = (compressor.compress(uncompressed_data[:50]) +
                       compressor.compress(uncompressed_data[50:]))
    for split1 in range(len(compressed_data) - 1):
        for split2 in range(split1, len(compressed_data)):
            decompressor = snappy.StreamDecompressor()
            self.assertEqual(
                (decompressor.decompress(compressed_data[:split1]) +
                 decompressor.decompress(compressed_data[split1:split2]) +
                 decompressor.decompress(compressed_data[split2:])),
                uncompressed_data)
def Compress(Input):
    Output = Input + '.snappy'
    file_in = file(Input, "rb")
    data = file_in.read()
    file_out = file(Output, "wb")
    c_data = snappy.compress(data)
    file_out.write(c_data)
    file_out.close()
    file_in.close()
def transform(self, buf):
    for trans_id in self.__write_transforms:
        if trans_id == TRANSFORM.ZLIB:
            buf = zlib.compress(buf)
        elif trans_id == TRANSFORM.SNAPPY:
            buf = snappy.compress(buf)
        elif trans_id == TRANSFORM.ZSTD:
            buf = zstd.ZstdCompressor(write_content_size=True).compress(buf)
        else:
            raise TTransportException(TTransportException.INVALID_TRANSFORM,
                                      "Unknown transform during send")
    return buf
def _get(url, callback, *args):
    key = get_sha1_key(url)
    r = cache_table.find_one({'_id': key})
    if not r:
        throttle.run()
        r = requests.get(url)
        content = callback(r, *args)
        data = bson.binary.Binary(snappy.compress(content))
        cache_table.insert_one({'_id': key, 'data': data})
    else:
        data = r['data']
        content = snappy.decompress(data)
    return content
def _WriteBlock(self):
    if not self._header_written:
        self._WriteHeader()

    if self.block_count <= 0:
        logger.info('Current block is empty, nothing to write.')
        return

    # write number of items in block
    self.encoder.write_long(self.block_count)

    # write block contents
    uncompressed_data = self._buffer_writer.getvalue()
    codec = self.GetMeta(CODEC_KEY).decode('utf-8')
    if codec == 'null':
        compressed_data = uncompressed_data
        compressed_data_length = len(compressed_data)
    elif codec == 'deflate':
        # The first two characters and last character are zlib
        # wrappers around deflate data.
        compressed_data = zlib.compress(uncompressed_data)[2:-1]
        compressed_data_length = len(compressed_data)
    elif codec == 'snappy':
        compressed_data = snappy.compress(uncompressed_data)
        compressed_data_length = len(compressed_data) + 4  # crc32
    else:
        fail_msg = '"%s" codec is not supported.' % codec
        raise DataFileException(fail_msg)

    # Write length of block
    self.encoder.write_long(compressed_data_length)

    # Write block
    self.writer.write(compressed_data)

    # Write CRC32 checksum for Snappy
    if self.GetMeta(CODEC_KEY) == 'snappy':
        self.encoder.write_crc32(uncompressed_data)

    # write sync marker
    self.writer.write(self.sync_marker)

    logger.debug(
        'Writing block with count=%d nbytes=%d sync=%r',
        self.block_count, compressed_data_length, self.sync_marker)

    # reset buffer
    self._buffer_writer.seek(0)
    self._buffer_writer.truncate()
    self._block_count = 0
def FlushChunk(self, chunk):
    bevy_offset = self.bevy_length

    if self.compression == lexicon.AFF4_IMAGE_COMPRESSION_ZLIB:
        compressed_chunk = zlib.compress(chunk)
    elif snappy and self.compression == lexicon.AFF4_IMAGE_COMPRESSION_SNAPPY:
        compressed_chunk = snappy.compress(chunk)
    elif self.compression == lexicon.AFF4_IMAGE_COMPRESSION_STORED:
        compressed_chunk = chunk

    self.bevy_index.append(bevy_offset)
    self.bevy.append(compressed_chunk)
    self.bevy_length += len(compressed_chunk)
    self.chunk_count_in_bevy += 1

    if self.chunk_count_in_bevy >= self.chunks_per_segment:
        self._FlushBevy()
def worker(q):
    while 1:
        fn_data = q.get()
        st = time.time()
        if fn_data is None:
            break
        orig = comp = 0
        fn, data = fn_data
        if fn.endswith('.jpg') or fn.endswith('.ppm'):
            cv2.imwrite(fn, data)
        elif fn.endswith('.npy'):
            np.save(fn, data)
        elif fn.endswith('.snappy'):
            data_snappy = snappy.compress(data)
            open(fn, 'w').write(data_snappy)
            orig = len(data)
            comp = len(data_snappy)
        print('Size[%d] Fn[%s] Time[%f] Orig[%d] Comp[%d]' %
              (q.qsize(), os.path.basename(fn), time.time() - st, orig, comp))