import numpy as np
import pyarrow as pa
import pytest


def test_compress_decompress():
    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli']
    for codec in codecs:
        compressed_buf = pa.compress(test_buf, codec=codec)
        compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True)

        assert isinstance(compressed_bytes, bytes)

        decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                         codec=codec)
        decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                           codec=codec, asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        # Decompressing without the expected output size is an error
        with pytest.raises(ValueError):
            pa.decompress(compressed_bytes, codec=codec)
@pytest.mark.parametrize("compression",
                         ["lz4", "snappy", "gzip", "zstd", "brotli"])
def test_compress_decompress(compression):
    # Skip codecs that this pyarrow build was compiled without
    if not pa.Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    compressed_buf = pa.compress(test_buf, codec=compression)
    compressed_bytes = pa.compress(test_data, codec=compression, asbytes=True)

    assert isinstance(compressed_bytes, bytes)

    decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                     codec=compression)
    decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                       codec=compression, asbytes=True)

    assert isinstance(decompressed_bytes, bytes)

    assert decompressed_buf.equals(test_buf)
    assert decompressed_bytes == test_data

    with pytest.raises(ValueError):
        pa.decompress(compressed_bytes, codec=compression)
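The same round trip can also be written against the pa.Codec object API instead of the module-level pa.compress/pa.decompress helpers. A minimal sketch, assuming a pyarrow build with gzip support (the function name is illustrative, not from the source):

def roundtrip_with_codec_object(data: bytes) -> bytes:
    # pa.Codec instances expose compress/decompress directly, mirroring
    # the module-level helpers used in the tests above
    codec = pa.Codec("gzip")
    compressed = codec.compress(data, asbytes=True)
    return codec.decompress(compressed, decompressed_size=len(data),
                            asbytes=True)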
def setRedis(keyname, data, sessionID):
    # Serialize with pyarrow, then compress (pa.compress defaults to lz4)
    inData = pa.serialize(data).to_buffer()
    compressLength = len(inData)  # needed as pa.decompress's decompressed_size
    inDataCompress = pa.compress(inData, asbytes=True)
    inDataDict = {'compressLength': compressLength,
                  'inDataCompress': inDataCompress}
    keyDict = {'key': f"{keyname}Cache{sessionID}"}
    redis.hmset(keyDict['key'], inDataDict)
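A hypothetical read-side counterpart (not part of the source) makes clear why compressLength is stored alongside the payload: pa.decompress needs the exact uncompressed size up front.

def getRedis(keyname, sessionID):
    # Hypothetical helper; hash field names follow setRedis above,
    # and redis-py returns them as bytes keys
    stored = redis.hgetall(f"{keyname}Cache{sessionID}")
    buf = pa.decompress(stored[b'inDataCompress'],
                        decompressed_size=int(stored[b'compressLength']),
                        codec='lz4')  # pa.compress's default codec
    return pa.deserialize(buf)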
def test_download_result(self, stub):
    job = Job(
        job_pb2.Job(
            id="foo",
            status=job_pb2.STATUS_SUCCESS,
            error=job_pb2.JobError(code=errors_pb2.ERROR_NONE),
        ))

    result = {}
    buffer = pa.serialize(result, context=serialization_context).to_buffer()
    codec = "lz4"

    responses.add(
        responses.GET,
        Job.BUCKET_PREFIX.format(job.id),
        body=pa.compress(buffer, codec=codec, asbytes=True),
        headers={
            "x-goog-meta-codec": codec,
            "x-goog-meta-decompressed_size": str(len(buffer)),
        },
        status=200,
    )

    assert job._download_result() == result
def object2proto(obj: pd.Series) -> PandasSeries_PB:
    """Convert a pd.Series to PandasSeries_PB with pyarrow.

    Args:
        obj: target Series

    Returns:
        Serialized version of the Series, which will be used for reconstruction.
    """
    # https://arrow.apache.org/docs/python/pandas.html
    # A Series must either be converted to a DataFrame or use pa.Array;
    # however, pa.Array requires you to account for null values yourself
    dataframe = obj.to_frame()
    schema = pa.Schema.from_pandas(dataframe)
    table = pa.Table.from_pandas(dataframe)

    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_file(sink, schema)
    writer.write(table)
    writer.close()

    buf = sink.getvalue()
    siz = len(buf)  # decompressed size, required by pa.decompress on read
    df_bytes = pa.compress(buf, asbytes=True)

    return PandasSeries_PB(series=df_bytes, decompressed_size=siz)
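A plausible proto2object counterpart (a sketch, not the project's actual code; the proto field names follow the serializer above) shows the reverse path. The DataFrame serializer later in this section deserializes the same way, minus the final Series extraction.

def proto2object_sketch(proto: PandasSeries_PB) -> pd.Series:
    # Hypothetical reverse of object2proto above: decompress with the
    # stored size (lz4 is pa.compress's default), reopen the IPC file,
    # and pull the single column back out as a Series
    buf = pa.decompress(proto.series,
                        decompressed_size=proto.decompressed_size,
                        codec='lz4')
    reader = pa.ipc.open_file(buf)
    dataframe = reader.read_all().to_pandas()
    return dataframe.iloc[:, 0]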
def f(val):
    # First serialize the data
    buf = pa.serialize(val, context=context).to_buffer()
    if compress:
        original_len = len(buf)
        # Compress the data
        buf = pa.compress(buf, codec=CODEC, asbytes=True)
        # Add the metadata required for decompression
        return pa.serialize((original_len, CODEC, buf)).to_buffer()
    else:
        return buf
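The matching reader is not shown in the source; a hypothetical inverse would unpack the (length, codec, payload) triple before decompressing. This sketch handles only the compressed branch of f and assumes the same serialization context:

def g(data):
    # Hypothetical inverse of f above: the outer pa.serialize in f used
    # no context, so the outer pa.deserialize here uses none either
    original_len, codec, payload = pa.deserialize(data)
    buf = pa.decompress(payload, decompressed_size=original_len, codec=codec)
    return pa.deserialize(buf, context=context)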
def test_download_result(self, stub):
    job = Job._from_proto(
        job_pb2.Job(
            id="foo",
            state=job_pb2.Job.State(stage=job_pb2.Job.Stage.SUCCEEDED),
        )
    )

    result = {}
    buffer = pa.serialize(result, context=serialization_context).to_buffer()
    codec = "lz4"

    responses.add(
        responses.GET,
        Job.BUCKET_PREFIX.format(job.id),
        body=pa.compress(buffer, codec=codec, asbytes=True),
        headers={
            "x-goog-meta-codec": codec,
            "x-goog-meta-decompressed_size": str(len(buffer)),
        },
        status=200,
    )

    assert job._download_result() == result
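For these mocks to pass, _download_result has to honor the two x-goog-meta-* headers. A minimal sketch of that logic (an assumption about the client, not its actual implementation):

def _download_result_sketch(job):
    import requests

    # Hypothetical reconstruction based on the mocked response above:
    # the object store reports the codec and the decompressed size in
    # headers, both of which pa.decompress needs
    resp = requests.get(Job.BUCKET_PREFIX.format(job.id))
    buf = pa.decompress(
        resp.content,
        decompressed_size=int(resp.headers["x-goog-meta-decompressed_size"]),
        codec=resp.headers["x-goog-meta-codec"],
    )
    return pa.deserialize(buf, context=serialization_context)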
def object2proto(obj: pd.DataFrame) -> PandasDataFrame_PB:
    """Convert a pd.DataFrame to PandasDataFrame_PB with pyarrow.

    Args:
        obj: target DataFrame

    Returns:
        Serialized version of the DataFrame, which will be used for
        reconstruction.
    """
    schema = pa.Schema.from_pandas(obj)
    table = pa.Table.from_pandas(obj)

    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_file(sink, schema)
    writer.write(table)
    writer.close()

    buf = sink.getvalue()
    siz = len(buf)  # decompressed size, stored for pa.decompress on read
    df_bytes = pa.compress(buf, asbytes=True)

    return PandasDataFrame_PB(dataframe=df_bytes, decompressed_size=siz)
def write_time_series_cache_to_disk(
        self, fname, data_frame,
        engine='hdf5_fixed', append_data=False,
        db_server=constants.db_server, db_port=constants.db_port,
        username=constants.db_username, password=constants.db_password,
        filter_out_matching=None, timeout=10,
        use_cache_compression=constants.use_cache_compression,
        parquet_compression=constants.parquet_compression,
        md_request=None, ticker=None):
    """Writes a Pandas DataFrame to disk as HDF5, bcolz, Parquet or CSV
    format, or to Arctic/MongoDB or Redis.

    Parameters
    ----------
    fname : str
        Path of file
    data_frame : DataFrame
        Data frame to be written to disk
    engine : str
        'hdf5_fixed' - use HDF5 fixed format, very quick, but cannot
            append to this
        'hdf5_table' - use HDF5 table format, slower but can append to
        'parquet' - use Parquet
        'arctic' - use Arctic/MongoDB database
        'redis' - use Redis
    append_data : bool
        False - write a fresh copy of data on disk each time
        True - append data to disk
    db_server : str
        Database server for Arctic (default: '127.0.0.1')
    timeout : int
        Number of seconds before timing out
    """
    logger = LoggerManager().getLogger(__name__)

    if md_request is not None:
        fname = self.path_join(
            fname, md_request.create_category_key(ticker=ticker))

    # Default HDF5 format
    hdf5_format = 'fixed'

    if 'hdf5' in engine:
        hdf5_format = engine.split('_')[1]
        engine = 'hdf5'

    if engine == 'bcolz':
        # Convert invalid characters to substitutes (which bcolz can't
        # deal with)
        data_frame.columns = self.find_replace_chars(
            data_frame.columns, _invalid_chars, _replace_chars)
        data_frame.columns = ['A_' + x for x in data_frame.columns]

        data_frame['DTS_'] = pandas.to_datetime(data_frame.index, unit='ns')

        bcolzpath = self.get_bcolz_filename(fname)
        shutil.rmtree(bcolzpath, ignore_errors=True)
        zlens = bcolz.ctable.fromdataframe(data_frame, rootdir=bcolzpath)
    elif engine == 'redis':
        fname = os.path.basename(fname).replace('.', '_')

        # Will fail if Redis is not installed
        try:
            r = redis.StrictRedis(host=db_server, port=db_port, db=0,
                                  socket_timeout=timeout,
                                  socket_connect_timeout=timeout)

            ping = r.ping()

            # If Redis is alive, try pushing to it
            if ping:
                if data_frame is not None:
                    if isinstance(data_frame, pandas.DataFrame):
                        mem = data_frame.memory_usage(deep=True).sum()
                        mem_float = round(float(mem) / (1024.0 * 1024.0), 3)

                        if mem_float < 500:
                            # msgpack/blosc is deprecated
                            # r.set(fname, data_frame.to_msgpack(compress='blosc'))

                            # Now uses pyarrow
                            context = pa.default_serialization_context()

                            ser = context.serialize(data_frame).to_buffer()

                            if use_cache_compression:
                                comp = pa.compress(ser, codec='lz4',
                                                   asbytes=True)
                                siz = len(ser)  # eg. siz = 3912

                                # Encode the decompressed size in the key,
                                # needed by pa.decompress on the read side
                                r.set('comp_' + str(siz) + '_' + fname, comp)
                            else:
                                r.set(fname, ser.to_pybytes())

                            logger.info("Pushed " + fname + " to Redis")
                        else:
                            logger.warning("Did not push " + fname
                                           + " to Redis, given size")
                else:
                    logger.info("Object " + fname
                                + " is empty, not pushed to Redis.")
            else:
                logger.warning("Didn't push " + fname
                               + " to Redis given not running")
        except Exception as e:
            logger.warning("Couldn't push " + fname + " to Redis: " + str(e))
    elif engine == 'arctic':
        from arctic import Arctic
        import pymongo

        socketTimeoutMS = 30 * 1000
        fname = os.path.basename(fname).replace('.', '_')

        logger.info('Load Arctic/MongoDB library: ' + fname)

        if username is not None and password is not None:
            c = pymongo.MongoClient(
                host="mongodb://" + username + ":" + password + "@"
                     + str(db_server) + ":" + str(db_port),
                connect=False)
        else:
            c = pymongo.MongoClient(
                host="mongodb://" + str(db_server) + ":" + str(db_port),
                connect=False)

        store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                       serverSelectionTimeoutMS=socketTimeoutMS,
                       connectTimeoutMS=socketTimeoutMS)

        database = None

        try:
            database = store[fname]
        except:
            pass

        if database is None:
            store.initialize_library(fname, audit=False)
            logger.info("Created MongoDB library: " + fname)
        else:
            logger.info("Got MongoDB library: " + fname)

        # Access the library
        library = store[fname]

        if 'intraday' in fname:
            data_frame = data_frame.astype('float32')

        if filter_out_matching is not None:
            cols = data_frame.columns

            new_cols = []

            for col in cols:
                if filter_out_matching not in col:
                    new_cols.append(col)

            data_frame = data_frame[new_cols]

        # Problems with Arctic when writing timezone to disk sometimes,
        # so strip it
        data_frame = data_frame.copy().tz_localize(None)

        try:
            # Can duplicate values if we have existing dates
            if append_data:
                library.append(fname, data_frame)
            else:
                library.write(fname, data_frame)

            c.close()

            logger.info("Written MongoDB library: " + fname)
        except Exception as e:
            logger.warning("Couldn't write MongoDB library: " + fname
                           + " " + str(e))
    elif engine == 'hdf5':
        h5_filename = self.get_h5_filename(fname)

        # Appending data only works for HDF5 stored as tables (but this
        # is much slower than fixed format); removes duplicated entries
        # at the end
        if append_data:
            store = pandas.HDFStore(h5_filename, format=hdf5_format,
                                    complib="zlib", complevel=9)

            if 'intraday' in fname:
                data_frame = data_frame.astype('float32')

            # Get the last row which matches and remove everything after
            # that (because the append function doesn't check for
            # duplicated rows)
            nrows = len(store['data'].index)
            last_point = data_frame.index[-1]

            i = nrows - 1

            while i > 0:
                read_index = store.select('data', start=i,
                                          stop=nrows).index[0]

                if read_index <= last_point:
                    break

                i = i - 1

            # Remove rows at the end, which are duplicates of the
            # incoming time series
            store.remove(key='data', start=i, stop=nrows)
            store.put(key='data', value=data_frame, format=hdf5_format,
                      append=True)
            store.close()
        else:
            h5_filename_temp = self.get_h5_filename(fname + ".temp")

            # Delete the old copy
            try:
                os.remove(h5_filename_temp)
            except:
                pass

            store = pandas.HDFStore(h5_filename_temp, complib="zlib",
                                    complevel=9)

            if 'intraday' in fname:
                data_frame = data_frame.astype('float32')

            store.put(key='data', value=data_frame, format=hdf5_format)
            store.close()

            # Delete the old copy
            try:
                os.remove(h5_filename)
            except:
                pass

            # Once written to disk, rename
            os.rename(h5_filename_temp, h5_filename)

        logger.info("Written HDF5: " + fname)
    elif engine == 'parquet':
        if '.parquet' not in fname:
            if fname[-5:] != '.gzip':
                fname = fname + '.parquet'

        self.to_parquet(data_frame, fname,
                        aws_region=constants.aws_region,
                        parquet_compression=parquet_compression)
        # data_frame.to_parquet(fname, compression=parquet_compression)

        logger.info("Written Parquet: " + fname)
    elif engine == 'csv':
        if '.csv' not in fname:
            fname = fname + '.csv'

        data_frame.to_csv(fname)

        logger.info("Written CSV: " + fname)
def test_compress(buf, codec='lz4'):
    # Forward the requested codec to pa.compress ('lz4' is its default)
    return pa.compress(buf, codec=codec)
with open(test_file) as fin:
    raw_dict = json.load(fin)
    raw_dict = {'hits': raw_dict['hits']['hits'] * 1}
    batch = pa.Table.from_pydict(raw_dict)

    # logging.info("arrow serialize/deserialize:")
    # buf = test_serialize(batch)
    # data = test_deserialize(buf)

    raw_bytes = str.encode(json.dumps(raw_dict))
    print("raw txt: ", len(raw_bytes))
    print("Table: ", batch.nbytes)

    buf = pa.serialize(batch).to_buffer()
    print("serialize buf: ", len(buf.to_pybytes()))

    com_buf = pa.compress(buf, codec='gzip')
    com_txt = pa.compress(raw_bytes, codec='gzip')
    print("compressed raw txt: ", len(com_txt.to_pybytes()))
    print("compressed buf: ", len(com_buf.to_pybytes()))

    print(buf.to_pybytes())
    print(raw_bytes)

    # array = batch.to_batches()[0][0]
    # field = ['_id', '_index', '_score', '_source', '_type']
    # sum = 0
    # array_size = 0
    # for i in field:
    #     tmp = array.field(i)
    #     print('-' * 50)
    #     print(i, tmp.nbytes)
def convert_python_to_binary(self, obj, key):
    """
    Parameters
    ----------
    obj : DataFrame (or Figure)
        Object to serialize

    key : str
        Key to store object

    Returns
    -------
    binary, str
    """
    if obj is None:
        return None

    # For pandas DataFrames
    if '_df' in key and isinstance(obj, pd.DataFrame):
        obj_list = self._chunk_dataframes(
            obj,
            chunk_size_mb=constants.volatile_cache_redis_max_cache_chunk_size_mb)

        # If compression has been specified (recommended!)
        if '_comp' in key:
            if constants.volatile_cache_redis_format == 'msgpack':
                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        obj_list[i] = obj_list[i].to_msgpack(
                            compress=constants.volatile_cache_redis_compression[
                                constants.volatile_cache_redis_format])
            elif constants.volatile_cache_redis_format == 'arrow':
                # Record the size of each compressed object, so it can be
                # read back later, eg. the key might become
                # xxxx_size_354534_size_345345_endsize etc.
                # Ignore the bit before the first '_size_' and after
                # '_endsize'
                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        ser = context.serialize(obj_list[i]).to_buffer()

                        obj_list[i] = pa.compress(
                            ser,
                            codec=constants.volatile_cache_redis_compression[
                                constants.volatile_cache_redis_format],
                            asbytes=True)

                        key = key + '_size_' + str(len(ser))

                key = key + '_endsizearrow_'
            else:
                raise Exception("Invalid volatile cache format specified.")
        elif '_comp' not in key:
            if constants.volatile_cache_redis_format == 'msgpack':
                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        obj_list[i] = obj_list[i].to_msgpack()
            elif constants.volatile_cache_redis_format == 'arrow':
                # context = pa.default_serialization_context()

                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        obj_list[i] = context.serialize(
                            obj_list[i]).to_buffer().to_pybytes()
            else:
                raise Exception("Invalid volatile cache format specified.")

    # For Plotly JSON style objects (assume these will fit in the cache,
    # as they tend to use downsampled data)
    elif '_fig' in key:
        # print("--------------- Converting " + key)
        # print(obj)
        obj_list = [self._plotly_fig_2_json(obj)]
    else:
        obj_list = [obj]

    return obj_list, key
def _convert_python_to_binary(self, obj, key, convert_cache_handle=True):
    logger = LoggerManager.getLogger(__name__)

    if obj is None:
        return None

    # For pandas DataFrames
    if '_df' in key and isinstance(obj, pd.DataFrame):
        # if obj.empty:
        #     return None

        obj_list = self._chunk_dataframes(obj)

        if '_comp' in key:
            if constants.volatile_cache_format == 'msgpack':
                # def to_msgpack(convert):
                #     if convert is not None:
                #         return convert.to_msgpack(
                #             compress=constants.volatile_cache_compression[
                #                 constants.volatile_cache_format])
                #
                #     return convert
                #
                # with PoolExecutor(max_workers=100) as executor:
                #     obj_list = executor.map(to_msgpack, obj_list)

                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        obj_list[i] = obj_list[i].to_msgpack(
                            compress=constants.volatile_cache_compression[
                                constants.volatile_cache_format])
            elif constants.volatile_cache_format == 'arrow':
                # Record the size of each compressed object in the key,
                # eg. the key might become
                # xxxx_size_354534_size_345345_endsize etc.
                # Ignore the bit before the first '_size_' and after
                # '_endsize'

                # context = pa.default_serialization_context()

                # def compress(convert):
                #     if convert is not None:
                #         ser = context.serialize(convert).to_buffer()
                #
                #         convert = pa.compress(
                #             ser,
                #             codec=constants.volatile_cache_compression[
                #                 constants.volatile_cache_format],
                #             asbytes=True)
                #
                #         size = len(ser)
                #
                #         return convert, size
                #
                # with PoolExecutor(max_workers=100) as executor:
                #     obj_list, size_list = zip(*executor.map(compress, obj_list))
                #
                # for s in size_list:
                #     key = key + '_size_' + str(s)

                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        ser = context.serialize(obj_list[i]).to_buffer()

                        obj_list[i] = pa.compress(
                            ser,
                            codec=constants.volatile_cache_compression[
                                constants.volatile_cache_format],
                            asbytes=True)

                        key = key + '_size_' + str(len(ser))

                key = key + '_endsize_'
            else:
                raise Exception("Invalid volatile cache format specified.")
        elif '_comp' not in key:
            if constants.volatile_cache_format == 'msgpack':
                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        obj_list[i] = obj_list[i].to_msgpack()
            elif constants.volatile_cache_format == 'arrow':
                # context = pa.default_serialization_context()

                for i in range(0, len(obj_list)):
                    if obj_list[i] is not None:
                        obj_list[i] = context.serialize(
                            obj_list[i]).to_buffer().to_pybytes()
            else:
                raise Exception("Invalid volatile cache format specified.")

    # For Plotly JSON style objects (assume these will fit in the cache,
    # as they tend to use downsampled data)
    elif '_fig' in key:
        # print("--------------- Converting " + key)
        # print(obj)
        obj_list = [self._plotly_fig_2_json(obj)]
    elif isinstance(obj, CacheHandle) and convert_cache_handle:
        obj_list = self._convert_python_to_binary(
            self.get_dataframe_handle(obj, burn_after_reading=True), key)
    else:
        obj_list = [obj]

    return obj_list, key
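The read side of the '_size_' key convention is not shown. A hypothetical sketch of how it would be consumed (the function name is illustrative, and chunks are assumed to arrive in write order):

def read_chunks_sketch(binary_chunks, key):
    # Hypothetical reader for keys built above, eg.
    # 'mykey_comp_df_size_354534_size_345345_endsize_': each embedded
    # size is the decompressed length pa.decompress requires, one per
    # chunk
    size_part = key.split('_size_', 1)[1].replace('_endsize_', '')
    sizes = [int(s) for s in size_part.split('_size_') if s]

    frames = []

    for chunk, size in zip(binary_chunks, sizes):
        buf = pa.decompress(
            chunk, decompressed_size=size,
            codec=constants.volatile_cache_compression[
                constants.volatile_cache_format])
        frames.append(context.deserialize(buf))

    return pd.concat(frames)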