Example #1
0
    def _pandas_to_bucket(df, symbol, initial_image):
        rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(df)}
        end = to_dt(df.index[-1].to_pydatetime())
        if initial_image :
            if 'index' in initial_image:
                start = min(to_dt(df.index[0].to_pydatetime()), initial_image['index'])
            else:
                start = to_dt(df.index[0].to_pydatetime())
            image_start = initial_image.get('index', start)
            image = {k: v for k, v in initial_image.items() if k != 'index'}
            rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
            final_image = TickStore._pandas_compute_final_image(df, initial_image, end)
        else:
            start = to_dt(df.index[0].to_pydatetime())
            final_image = {}
        rtn[END] = end
        rtn[START] = start

        logger.warning("NB treating all values as 'exists' - no longer sparse")
        rowmask = Binary(compressHC(np.packbits(np.ones(len(df), dtype='uint8')).tostring()))

        index_name = df.index.names[0] or "index"
        recs = df.to_records(convert_datetime64=False)
        for col in df:
            array = TickStore._ensure_supported_dtypes(recs[col])
            col_data = {}
            col_data[DATA] = Binary(compressHC(array.tostring()))
            col_data[ROWMASK] = rowmask
            col_data[DTYPE] = TickStore._str_dtype(array.dtype)
            rtn[COLUMNS][col] = col_data
        rtn[INDEX] = Binary(compressHC(np.concatenate(([recs[index_name][0].astype('datetime64[ms]').view('uint64')],
                                                           np.diff(recs[index_name].astype('datetime64[ms]').view('uint64')))).tostring()))
        return rtn, final_image
Example #2
0
    def _pandas_to_bucket(df, symbol, initial_image):
        rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(df)}
        end = to_dt(df.index[-1].to_pydatetime())
        if initial_image :
            if 'index' in initial_image:
                start = min(to_dt(df.index[0].to_pydatetime()), initial_image['index'])
            else:
                start = to_dt(df.index[0].to_pydatetime())
            image_start = initial_image.get('index', start)
            image = {k: v for k, v in initial_image.items() if k != 'index'}
            rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
            final_image = TickStore._pandas_compute_final_image(df, initial_image, end)
        else:
            start = to_dt(df.index[0].to_pydatetime())
            final_image = {}
        rtn[END] = end
        rtn[START] = start

        logger.warning("NB treating all values as 'exists' - no longer sparse")
        rowmask = Binary(compressHC(np.packbits(np.ones(len(df), dtype='uint8')).tostring()))

        index_name = df.index.names[0] or "index"
        recs = df.to_records(convert_datetime64=False)
        for col in df:
            array = TickStore._ensure_supported_dtypes(recs[col])
            col_data = {}
            col_data[DATA] = Binary(compressHC(array.tostring()))
            col_data[ROWMASK] = rowmask
            col_data[DTYPE] = TickStore._str_dtype(array.dtype)
            rtn[COLUMNS][col] = col_data
        rtn[INDEX] = Binary(compressHC(np.concatenate(([recs[index_name][0].astype('datetime64[ms]').view('uint64')],
                                                           np.diff(recs[index_name].astype('datetime64[ms]').view('uint64')))).tostring()))
        return rtn, final_image
Example #3
0
    def _to_bucket(ticks, symbol, initial_image):
        rtn = {
            SYMBOL: symbol,
            VERSION: CHUNK_VERSION_NUMBER,
            COLUMNS: {},
            COUNT: len(ticks)
        }
        data = {}
        rowmask = {}
        start = to_dt(ticks[0]['index'])
        end = to_dt(ticks[-1]['index'])
        final_image = copy.copy(initial_image) if initial_image else {}
        for i, t in enumerate(ticks):
            if initial_image:
                final_image.update(t)
            for k, v in iteritems(t):
                try:
                    if k != 'index':
                        rowmask[k][i] = 1
                    else:
                        v = TickStore._to_ms(v)
                        if data[k][-1] > v:
                            raise UnorderedDataException(
                                "Timestamps out-of-order: %s > %s" %
                                (ms_to_datetime(data[k][-1]), t))
                    data[k].append(v)
                except KeyError:
                    if k != 'index':
                        rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                        rowmask[k][i] = 1
                    data[k] = [v]

        rowmask = dict([(k, Binary(compressHC(np.packbits(v).tostring())))
                        for k, v in iteritems(rowmask)])
        for k, v in iteritems(data):
            if k != 'index':
                v = np.array(v)
                v = TickStore._ensure_supported_dtypes(v)
                rtn[COLUMNS][k] = {
                    DATA: Binary(compressHC(v.tostring())),
                    DTYPE: TickStore._str_dtype(v.dtype),
                    ROWMASK: rowmask[k]
                }

        if initial_image:
            image_start = initial_image.get('index', start)
            if image_start > start:
                raise UnorderedDataException(
                    "Image timestamp is after first tick: %s > %s" %
                    (image_start, start))
            start = min(start, image_start)
            rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
        rtn[END] = end
        rtn[START] = start
        rtn[INDEX] = Binary(
            compressHC(
                np.concatenate(
                    ([data['index'][0]], np.diff(data['index']))).tostring()))
        return rtn, final_image
Example #4
0
def test_read_backward_compatibility():
    """Test backwards compatibility with a pickled file that's created with Python 2.7.3,
    Numpy 1.7.1_ahl2 and Pandas 0.14.1
    """
    fname = path.join(path.dirname(__file__), "data", "test-data.pkl")

    # For newer versions; verify that unpickling fails when using cPickle
    if PANDAS_VERSION >= LooseVersion("0.16.1"):
        if sys.version_info[0] >= 3:
            with pytest.raises(UnicodeDecodeError), open(fname) as fh:
                cPickle.load(fh)
        else:
            with pytest.raises(TypeError), open(fname) as fh:
                cPickle.load(fh)

    # Verify that PickleStore() uses a backwards compatible unpickler.
    store = PickleStore()

    with open(fname) as fh:
        # PickleStore compresses data with lz4
        version = {'blob': compressHC(fh.read())}
    df = store.read(sentinel.arctic_lib, version, sentinel.symbol)

    expected = pd.DataFrame(range(4), pd.date_range(start="20150101",
                                                    periods=4))
    assert (df == expected).all().all()
Example #5
0
def test_read_backward_compatibility():
    """Test backwards compatibility with a pickled file that's created with Python 2.7.3,
    Numpy 1.7.1_ahl2 and Pandas 0.14.1
    """
    fname = path.join(path.dirname(__file__), "data", "test-data.pkl")

    # For newer versions; verify that unpickling fails when using cPickle
    if PANDAS_VERSION >= LooseVersion("0.16.1"):
        if sys.version_info[0] >= 3:
            with pytest.raises(UnicodeDecodeError), open(fname) as fh:
                cPickle.load(fh)
        else:
            with pytest.raises(TypeError), open(fname) as fh:
                cPickle.load(fh)

    # Verify that PickleStore() uses a backwards compatible unpickler.
    store = PickleStore()

    with open(fname) as fh:
        # PickleStore compresses data with lz4
        version = {'blob': compressHC(fh.read())}
    df = store.read(sentinel.arctic_lib, version, sentinel.symbol)

    expected = pd.DataFrame(range(4), pd.date_range(start="20150101", periods=4))
    assert (df == expected).all().all()
Example #6
0
def test_pickle_store_future_version():
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__VERSION_ONE_MILLION'}
    coll = Mock()
    arctic_lib = Mock()
    datap = compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [
        {
            'data': Binary(data_1),
            'symbol': 'sentinel.symbol',
            'segment': 0
        },
        {
            'data': Binary(data_2),
            'symbol': 'sentinel.symbol',
            'segment': 1
        },
    ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    with pytest.raises(UnsupportedPickleStoreVersion) as e:
        ps.read(arctic_lib, version, sentinel.symbol)
    assert ('unsupported version of pickle store' in str(e))
Example #7
0
    def _to_bucket(ticks, symbol, initial_image):
        rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(ticks)}
        data = {}
        rowmask = {}
        start = to_dt(ticks[0]['index'])
        end = to_dt(ticks[-1]['index'])
        final_image = copy.copy(initial_image) if initial_image else {}
        for i, t in enumerate(ticks):
            if initial_image:
                final_image.update(t)
            for k, v in iteritems(t):
                try:
                    if k != 'index':
                        rowmask[k][i] = 1
                    else:
                        v = TickStore._to_ms(v)
                        if data[k][-1] > v:
                            raise UnorderedDataException("Timestamps out-of-order: %s > %s" % (
                                                          ms_to_datetime(data[k][-1]), t))
                    data[k].append(v)
                except KeyError:
                    if k != 'index':
                        rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                        rowmask[k][i] = 1
                    data[k] = [v]

        rowmask = dict([(k, Binary(compressHC(np.packbits(v).tostring())))
                        for k, v in iteritems(rowmask)])
        for k, v in iteritems(data):
            if k != 'index':
                v = np.array(v)
                v = TickStore._ensure_supported_dtypes(v)
                rtn[COLUMNS][k] = {DATA: Binary(compressHC(v.tostring())),
                                   DTYPE: TickStore._str_dtype(v.dtype),
                                   ROWMASK: rowmask[k]}

        if initial_image:
            image_start = initial_image.get('index', start)
            if image_start > start:
                raise UnorderedDataException("Image timestamp is after first tick: %s > %s" % (
                                              image_start, start))
            start = min(start, image_start)
            rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
        rtn[END] = end
        rtn[START] =  start
        rtn[INDEX] = Binary(compressHC(np.concatenate(([data['index'][0]], np.diff(data['index']))).tostring()))
        return rtn, final_image
Example #8
0
def test_unpickle_highest_protocol():
    """Pandas version 0.14.1 fails to unpickle a pandas.Series() in compat mode if the
    container has been pickled with HIGHEST_PROTOCOL.
    """
    version = {
        'blob': compressHC(cPickle.dumps(pd.Series(), protocol=cPickle.HIGHEST_PROTOCOL)),
    }

    store = PickleStore()
    ps = store.read(sentinel.arctic_lib, version, sentinel.symbol)

    expected = pd.Series()
    assert (ps == expected).all()
Example #9
0
def test_read_object_2():
    self = create_autospec(PickleStore)
    version = {'_id': sentinel._id,
               'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    coll.find.return_value = [{'data': Binary(compressHC(cPickle.dumps(object))),
                               'symbol': 'sentinel.symbol'}
                              ]
    arctic_lib.get_top_level_collection.return_value = coll

    assert PickleStore.read(self, arctic_lib, version, sentinel.symbol) == object
    assert coll.find.call_args_list == [call({'symbol': sentinel.symbol, 'parent': sentinel._id}, sort=[('segment', 1)])]
Example #10
0
def bench_single(repeats, _strarr, use_HC):
    # Arctic compress single
    measurements = []
    for i in range(repeats):
        now = dt.now()
        if use_HC:
            res = [c.compressHC(x) for x in _strarr]
        else:
            res = [c.compress(x) for x in _strarr]
        sample = (dt.now() - now).total_seconds()
        assert all(res)
        measurements.append(sample)
    return measurements
Example #11
0
def bench_single(repeats, _strarr, use_HC):
    # Arctic compress single
    measurements = []
    for i in range(repeats):
        now = dt.now()
        if use_HC:
            res = [c.compressHC(x) for x in _strarr]
        else:
            res = [c.compress(x) for x in _strarr]
        sample = (dt.now() - now).total_seconds()
        assert all(res)
        measurements.append(sample)
    return measurements
Example #12
0
def test_unpickle_highest_protocol():
    """Pandas version 0.14.1 fails to unpickle a pandas.Series() in compat mode if the
    container has been pickled with HIGHEST_PROTOCOL.
    """
    version = {
        'blob':
        compressHC(
            cPickle.dumps(pd.Series(), protocol=cPickle.HIGHEST_PROTOCOL)),
    }

    store = PickleStore()
    ps = store.read(sentinel.arctic_lib, version, sentinel.symbol)

    expected = pd.Series()
    assert (ps == expected).all()
Example #13
0
def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]
    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()
    now = dt.now()
    c.decompress_array(c.compressHC_array(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()
    now = dt.now()
    [lz4_decompress(y) for y in [lz4_compress(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()
    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("    LZ4 HC %s s" % clz4_time)
    print("    LZ4 HC Parallel %s s" % clz4_time_p)
    print("    LZ4 %s s" % lz4_time)
def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]
    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()
    now = dt.now()
    c.decompress_array(c.compressHC_array(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()
    now = dt.now()
    [lz4_decompress(y) for y in [lz4_compress(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()
    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("    LZ4 HC %s s" % clz4_time)
    print("    LZ4 HC Parallel %s s" % clz4_time_p)
    print("    LZ4 %s s" % lz4_time)
Example #15
0
def test_pickle_chunk_V1_read():
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id,
               'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    datap = compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [{'data': Binary(data_1),
                               'symbol': 'sentinel.symbol',
                               'segment': 0},
                              {'data': Binary(data_2),
                               'symbol': 'sentinel.symbol',
                               'segment': 1},
                              ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    assert(data == ps.read(arctic_lib, version, sentinel.symbol))
Example #16
0
def test_read_object_2():
    self = create_autospec(PickleStore)
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    coll.find.return_value = [{
        'data': Binary(compressHC(cPickle.dumps(object))),
        'symbol': 'sentinel.symbol'
    }]
    arctic_lib.get_top_level_collection.return_value = coll

    assert PickleStore.read(self, arctic_lib, version,
                            sentinel.symbol) == object
    assert coll.find.call_args_list == [
        call({
            'symbol': sentinel.symbol,
            'parent': sentinel._id
        },
             sort=[('segment', 1)])
    ]
Example #17
0
def test_pickle_store_future_version():
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id,
               'blob': '__chunked__VERSION_ONE_MILLION'}
    coll = Mock()
    arctic_lib = Mock()
    datap = compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [{'data': Binary(data_1),
                               'symbol': 'sentinel.symbol',
                               'segment': 0},
                              {'data': Binary(data_2),
                               'symbol': 'sentinel.symbol',
                               'segment': 1},
                              ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    with pytest.raises(UnsupportedPickleStoreVersion) as e:
        ps.read(arctic_lib, version, sentinel.symbol)
    assert('unsupported version of pickle store' in str(e))
Example #18
0
def test_pickle_chunk_V1_read():
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    datap = compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [
        {
            'data': Binary(data_1),
            'symbol': 'sentinel.symbol',
            'segment': 0
        },
        {
            'data': Binary(data_2),
            'symbol': 'sentinel.symbol',
            'segment': 1
        },
    ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    assert (data == ps.read(arctic_lib, version, sentinel.symbol))
Example #19
0
def test_read_object_backwards_compat():
    self = create_autospec(PickleStore)
    version = {'blob': Binary(compressHC(cPickle.dumps(object)))}
    assert PickleStore.read(self, sentinel.arctic_lib, version, sentinel.symbol) == object
Example #20
0
def test_read_object_backwards_compat():
    self = create_autospec(PickleStore)
    version = {'blob': Binary(compressHC(cPickle.dumps(object)))}
    assert PickleStore.read(self, sentinel.arctic_lib, version,
                            sentinel.symbol) == object