def _pandas_to_bucket(df, symbol, initial_image):
    """Serialize a DataFrame of ticks into a single bucket document.

    Parameters
    ----------
    df : pandas.DataFrame
        Tick data indexed by timestamp (assumed ascending — TODO confirm caller guarantees this).
    symbol : str
        Symbol the ticks belong to.
    initial_image : dict or None
        Optional snapshot of state just before the first tick; may carry an
        'index' key holding the snapshot's timestamp.

    Returns
    -------
    (dict, dict)
        The bucket document and the final image after applying all rows.
    """
    rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(df)}
    end = to_dt(df.index[-1].to_pydatetime())
    if initial_image:
        if 'index' in initial_image:
            # Bucket start must also cover the image timestamp when it
            # precedes the first tick.
            start = min(to_dt(df.index[0].to_pydatetime()), initial_image['index'])
        else:
            start = to_dt(df.index[0].to_pydatetime())
        image_start = initial_image.get('index', start)
        # NOTE: previously a stripped copy of the image (without 'index') was
        # built here but never used; the full initial_image is what is stored.
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
        final_image = TickStore._pandas_compute_final_image(df, initial_image, end)
    else:
        start = to_dt(df.index[0].to_pydatetime())
        final_image = {}
    rtn[END] = end
    rtn[START] = start

    logger.warning("NB treating all values as 'exists' - no longer sparse")
    # All rows marked present: pack an all-ones bitmask and compress it once,
    # shared by every column.
    rowmask = Binary(compressHC(np.packbits(np.ones(len(df), dtype='uint8')).tostring()))

    index_name = df.index.names[0] or "index"
    recs = df.to_records(convert_datetime64=False)
    for col in df:
        array = TickStore._ensure_supported_dtypes(recs[col])
        rtn[COLUMNS][col] = {
            DATA: Binary(compressHC(array.tostring())),
            ROWMASK: rowmask,
            DTYPE: TickStore._str_dtype(array.dtype),
        }
    # Index stored as first-timestamp + deltas, in uint64 ms-since-epoch.
    rtn[INDEX] = Binary(compressHC(np.concatenate(
        ([recs[index_name][0].astype('datetime64[ms]').view('uint64')],
         np.diff(recs[index_name].astype('datetime64[ms]').view('uint64')))).tostring()))
    return rtn, final_image
def _to_bucket(ticks, symbol, initial_image):
    # Serialize a list of tick dicts (each with an 'index' timestamp key plus
    # arbitrary field values) into a single bucket document.
    # Returns (bucket_doc, final_image): final_image is initial_image overlaid
    # with every tick in order (only maintained when an initial_image exists).
    rtn = {
        SYMBOL: symbol,
        VERSION: CHUNK_VERSION_NUMBER,
        COLUMNS: {},
        COUNT: len(ticks)
    }
    data = {}      # field name -> list of values (one per tick that had the field)
    rowmask = {}   # field name -> uint8 presence bitmap over all ticks
    start = to_dt(ticks[0]['index'])
    end = to_dt(ticks[-1]['index'])
    final_image = copy.copy(initial_image) if initial_image else {}
    for i, t in enumerate(ticks):
        if initial_image:
            final_image.update(t)
        for k, v in iteritems(t):
            try:
                if k != 'index':
                    # Field already seen: just mark presence; value appended below.
                    rowmask[k][i] = 1
                else:
                    v = TickStore._to_ms(v)
                    if data[k][-1] > v:
                        raise UnorderedDataException(
                            "Timestamps out-of-order: %s > %s" % (ms_to_datetime(data[k][-1]), t))
                data[k].append(v)
            except KeyError:
                # First occurrence of this field: allocate its bitmap/value list.
                # (For 'index' on tick 0, only data[k] is created.)
                if k != 'index':
                    rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                    rowmask[k][i] = 1
                data[k] = [v]
    # Pack each presence bitmap to bits and compress.
    rowmask = dict([(k, Binary(compressHC(np.packbits(v).tostring())))
                    for k, v in iteritems(rowmask)])
    for k, v in iteritems(data):
        if k != 'index':
            v = np.array(v)
            v = TickStore._ensure_supported_dtypes(v)
            rtn[COLUMNS][k] = {
                DATA: Binary(compressHC(v.tostring())),
                DTYPE: TickStore._str_dtype(v.dtype),
                ROWMASK: rowmask[k]
            }

    if initial_image:
        image_start = initial_image.get('index', start)
        if image_start > start:
            raise UnorderedDataException(
                "Image timestamp is after first tick: %s > %s" % (image_start, start))
        # Bucket start extends back to the image time if it precedes the ticks.
        start = min(start, image_start)
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
    rtn[END] = end
    rtn[START] = start
    # Index stored as first ms timestamp + successive deltas.
    rtn[INDEX] = Binary(
        compressHC(
            np.concatenate(
                ([data['index'][0]], np.diff(data['index']))).tostring()))
    return rtn, final_image
def test_read_backward_compatibility():
    """Test backwards compatibility with a pickled file that's created with
    Python 2.7.3, Numpy 1.7.1_ahl2 and Pandas 0.14.1
    """
    fname = path.join(path.dirname(__file__), "data", "test-data.pkl")

    # For newer versions; verify that unpickling fails when using cPickle
    if PANDAS_VERSION >= LooseVersion("0.16.1"):
        if sys.version_info[0] >= 3:
            # NOTE(review): file opened in text mode on purpose — on Py3 the
            # binary pickle triggers UnicodeDecodeError before unpickling.
            with pytest.raises(UnicodeDecodeError), open(fname) as fh:
                cPickle.load(fh)
        else:
            with pytest.raises(TypeError), open(fname) as fh:
                cPickle.load(fh)

    # Verify that PickleStore() uses a backwards compatible unpickler.
    store = PickleStore()
    with open(fname) as fh:
        # PickleStore compresses data with lz4
        version = {'blob': compressHC(fh.read())}
    df = store.read(sentinel.arctic_lib, version, sentinel.symbol)

    # The fixture file contains this exact 4-row frame.
    expected = pd.DataFrame(range(4), pd.date_range(start="20150101", periods=4))
    assert (df == expected).all().all()
def test_pickle_store_future_version():
    """A chunked blob tagged with an unknown future version must be rejected."""
    payload = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__VERSION_ONE_MILLION'}
    collection = Mock()
    arctic_lib = Mock()
    pickled = compressHC(cPickle.dumps(payload, protocol=cPickle.HIGHEST_PROTOCOL))
    # Split the compressed pickle across two segment documents.
    collection.find.return_value = [
        {'data': Binary(chunk), 'symbol': 'sentinel.symbol', 'segment': seg_no}
        for seg_no, chunk in enumerate((pickled[:5], pickled[5:]))
    ]
    arctic_lib.get_top_level_collection.return_value = collection

    store = PickleStore()
    with pytest.raises(UnsupportedPickleStoreVersion) as e:
        store.read(arctic_lib, version, sentinel.symbol)
    assert ('unsupported version of pickle store' in str(e))
def _to_bucket(ticks, symbol, initial_image):
    # Build one bucket document from a list of tick dicts. Each tick carries
    # an 'index' timestamp plus field values; fields may be sparse across ticks.
    rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(ticks)}
    data = {}      # per-field value lists; 'index' holds ms timestamps
    rowmask = {}   # per-field uint8 presence bitmap (excludes 'index')
    start = to_dt(ticks[0]['index'])
    end = to_dt(ticks[-1]['index'])
    final_image = copy.copy(initial_image) if initial_image else {}
    for i, t in enumerate(ticks):
        if initial_image:
            # final_image ends up as the last value seen for every field.
            final_image.update(t)
        for k, v in iteritems(t):
            try:
                if k != 'index':
                    rowmask[k][i] = 1
                else:
                    v = TickStore._to_ms(v)
                    if data[k][-1] > v:
                        # Ticks must be monotonically non-decreasing in time.
                        raise UnorderedDataException("Timestamps out-of-order: %s > %s" % (
                            ms_to_datetime(data[k][-1]), t))
                data[k].append(v)
            except KeyError:
                # First occurrence of field k: allocate its bitmap/value list.
                if k != 'index':
                    rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                    rowmask[k][i] = 1
                data[k] = [v]
    rowmask = dict([(k, Binary(compressHC(np.packbits(v).tostring())))
                    for k, v in iteritems(rowmask)])
    for k, v in iteritems(data):
        if k != 'index':
            v = np.array(v)
            v = TickStore._ensure_supported_dtypes(v)
            rtn[COLUMNS][k] = {DATA: Binary(compressHC(v.tostring())),
                               DTYPE: TickStore._str_dtype(v.dtype),
                               ROWMASK: rowmask[k]}

    if initial_image:
        image_start = initial_image.get('index', start)
        if image_start > start:
            raise UnorderedDataException("Image timestamp is after first tick: %s > %s" % (
                image_start, start))
        # An earlier image extends the bucket's start back in time.
        start = min(start, image_start)
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
    rtn[END] = end
    rtn[START] = start
    # Delta-encode the index: first ms timestamp, then successive diffs.
    rtn[INDEX] = Binary(compressHC(np.concatenate(([data['index'][0]],
                                                   np.diff(data['index']))).tostring()))
    return rtn, final_image
def test_unpickle_highest_protocol():
    """Pandas version 0.14.1 fails to unpickle a pandas.Series() in compat mode
    if the container has been pickled with HIGHEST_PROTOCOL.
    """
    blob = cPickle.dumps(pd.Series(), protocol=cPickle.HIGHEST_PROTOCOL)
    version = {'blob': compressHC(blob)}

    result = PickleStore().read(sentinel.arctic_lib, version, sentinel.symbol)

    assert (result == pd.Series()).all()
def test_read_object_2():
    """A single-segment '__chunked__' blob is fetched, decompressed and unpickled."""
    self = create_autospec(PickleStore)
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    arctic_lib = Mock()
    collection = Mock()
    segment_doc = {'data': Binary(compressHC(cPickle.dumps(object))),
                   'symbol': 'sentinel.symbol'}
    collection.find.return_value = [segment_doc]
    arctic_lib.get_top_level_collection.return_value = collection

    assert PickleStore.read(self, arctic_lib, version, sentinel.symbol) == object
    # Segments must be queried by symbol+parent id, ordered by segment number.
    expected_query = {'symbol': sentinel.symbol, 'parent': sentinel._id}
    assert collection.find.call_args_list == [call(expected_query, sort=[('segment', 1)])]
def bench_single(repeats, _strarr, use_HC):
    """Benchmark one-at-a-time arctic compression of every string in `_strarr`.

    Runs `repeats` rounds and returns the list of per-round durations (seconds).
    """
    # Pick the compressor once, outside the timed loop.
    compress_fn = c.compressHC if use_HC else c.compress
    measurements = []
    for _ in range(repeats):
        started = dt.now()
        compressed = [compress_fn(s) for s in _strarr]
        elapsed = (dt.now() - started).total_seconds()
        assert all(compressed)
        measurements.append(elapsed)
    return measurements
def test_unpickle_highest_protocol():
    """Pandas version 0.14.1 fails to unpickle a pandas.Series() in compat mode
    if the container has been pickled with HIGHEST_PROTOCOL.
    """
    # Round-trip an empty Series through the highest pickle protocol and the
    # lz4 compression used by PickleStore.
    version = {
        'blob': compressHC(
            cPickle.dumps(pd.Series(), protocol=cPickle.HIGHEST_PROTOCOL)),
    }
    store = PickleStore()

    ps = store.read(sentinel.arctic_lib, version, sentinel.symbol)

    expected = pd.Series()
    assert (ps == expected).all()
def test_performance_sequential(n, length):
    """Compare sequential LZ4-HC, parallel LZ4-HC and plain LZ4 round-trip timings."""
    sample = random_string(length)
    payloads = [sample for _ in range(n)]

    def _timed(run):
        # Wall-clock a single compress+decompress pass.
        begin = dt.now()
        run()
        return (dt.now() - begin).total_seconds()

    clz4_time = _timed(lambda: [c.decompress(y) for y in [c.compressHC(x) for x in payloads]])
    clz4_time_p = _timed(lambda: c.decompress_array(c.compressHC_array(payloads)))
    lz4_time = _timed(lambda: [lz4_decompress(y) for y in [lz4_compress(x) for x in payloads]])

    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print(" LZ4 HC %s s" % clz4_time)
    print(" LZ4 HC Parallel %s s" % clz4_time_p)
    print(" LZ4 %s s" % lz4_time)
def test_pickle_chunk_V1_read():
    """A legacy '__chunked__' (V1) blob split across two segments is reassembled."""
    payload = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    collection = Mock()
    arctic_lib = Mock()
    pickled = compressHC(cPickle.dumps(payload, protocol=cPickle.HIGHEST_PROTOCOL))
    # Store the compressed pickle as two ordered segment documents.
    collection.find.return_value = [
        {'data': Binary(chunk), 'symbol': 'sentinel.symbol', 'segment': idx}
        for idx, chunk in enumerate((pickled[:5], pickled[5:]))
    ]
    arctic_lib.get_top_level_collection.return_value = collection

    store = PickleStore()
    assert payload == store.read(arctic_lib, version, sentinel.symbol)
def test_read_object_2():
    # Reading a '__chunked__' blob should fetch segments from the top-level
    # collection and unpickle them; here one segment holds a pickle of `object`.
    self = create_autospec(PickleStore)
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    coll.find.return_value = [{
        'data': Binary(compressHC(cPickle.dumps(object))),
        'symbol': 'sentinel.symbol'
    }]
    arctic_lib.get_top_level_collection.return_value = coll

    assert PickleStore.read(self, arctic_lib, version, sentinel.symbol) == object
    # Segments must be queried by symbol+parent id and sorted by segment number.
    assert coll.find.call_args_list == [
        call({
            'symbol': sentinel.symbol,
            'parent': sentinel._id
        }, sort=[('segment', 1)])
    ]
def test_pickle_store_future_version():
    # A blob marked '__chunked__VERSION_ONE_MILLION' claims a future, unknown
    # pickle-store version: read() must raise instead of mis-decoding the data.
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__VERSION_ONE_MILLION'}
    coll = Mock()
    arctic_lib = Mock()
    datap = compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    # Split the compressed pickle into two segment documents.
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [{'data': Binary(data_1), 'symbol': 'sentinel.symbol', 'segment': 0},
                              {'data': Binary(data_2), 'symbol': 'sentinel.symbol', 'segment': 1},
                              ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    with pytest.raises(UnsupportedPickleStoreVersion) as e:
        ps.read(arctic_lib, version, sentinel.symbol)
    assert('unsupported version of pickle store' in str(e))
def test_pickle_chunk_V1_read():
    # Legacy '__chunked__' (V1) format: the compressed pickle is split across
    # segment documents and must be reassembled in segment order on read.
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    datap = compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    # Two segments carved from the same compressed blob.
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [
        {
            'data': Binary(data_1),
            'symbol': 'sentinel.symbol',
            'segment': 0
        },
        {
            'data': Binary(data_2),
            'symbol': 'sentinel.symbol',
            'segment': 1
        },
    ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    assert (data == ps.read(arctic_lib, version, sentinel.symbol))
def test_read_object_backwards_compat():
    """Old-format versions keep the compressed pickle directly under 'blob'."""
    self = create_autospec(PickleStore)
    blob = Binary(compressHC(cPickle.dumps(object)))
    result = PickleStore.read(self, sentinel.arctic_lib, {'blob': blob}, sentinel.symbol)
    assert result == object