def test_bucket(self):
    """Different buckets must isolate objects stored under identical names."""
    store_a = OmegaStore(bucket='foo')
    store_b = OmegaStore(bucket='bar')
    store_a.register_backend(PythonRawFileBackend.KIND, PythonRawFileBackend)
    store_b.register_backend(PythonRawFileBackend.KIND, PythonRawFileBackend)
    # -- data: same name, different payloads per bucket
    data_a = {'foo': 'bar', 'bax': 'fox'}
    data_b = {'foo': 'bax', 'bax': 'foz'}
    store_a.put(data_a, 'data')
    store_b.put(data_b, 'data')
    self.assertEqual(store_a.get('data')[0], data_a)
    self.assertEqual(store_b.get('data')[0], data_b)
    # -- files: same name, different contents per bucket
    text_a = "some data"
    store_a.put(BytesIO(text_a.encode('utf-8')), 'myfile')
    text_b = "some other data"
    store_b.put(BytesIO(text_b.encode('utf-8')), 'myfile')
    self.assertNotEqual(store_a.get('myfile').read(),
                        store_b.get('myfile').read())
def test_get_dataframe_colspec_opspec(self):
    """Dataset names support [] column and # operator qualifiers on get()."""
    df = pd.DataFrame({
        'a': list(range(1, 10)),
        'b': list(range(1, 10)),
        'c': list(range(1, 10)),
    })
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    # [] selects columns; a bare # yields the default chunk iterator
    chunks = store.get('mydata[a]#')
    self.assertTrue(hasattr(chunks, '__next__'))
    first = next(chunks)
    self.assertEqual(len(first), len(df))
    assert_frame_equal(first, df[['a']])
    # an operator can be named explicitly after #
    chunks = store.get('mydata[a,b]#iterchunks')
    first = next(chunks)
    self.assertTrue(hasattr(chunks, '__next__'))
    self.assertEqual(len(first), len(df))
    assert_frame_equal(first, df[['a', 'b']])
    # operator kwargs follow a colon
    chunks = store.get('mydata[a,b]#iterchunks:chunksize=1')
    first = next(chunks)
    self.assertTrue(hasattr(chunks, '__next__'))
    self.assertEqual(len(first), 1)
    assert_frame_equal(first, df[['a', 'b']].iloc[0:1])
def test_put_dataframe_timestamp(self):
    """put(timestamp=...) adds a creation-timestamp column in all its forms.

    Covers timestamp=True (default '_created' column), a custom column
    name, a (column, value) tuple, and a plain datetime value.
    """
    # import both names locally: the original only imported datetime here
    # and silently relied on timedelta being available at module level
    from datetime import datetime, timedelta
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')

    def _stored_ts(df_, col):
        # extract the single timestamp value stored in the given column
        return df_[col].astype(datetime).unique()[0].to_pydatetime()

    # -- check default timestamp
    now = datetime.utcnow()
    store.put(df, 'mydata', append=False, timestamp=True)
    df2 = store.get('mydata')
    _created = _stored_ts(df2, '_created')
    self.assertEqual(_created.replace(second=0, microsecond=0),
                     now.replace(second=0, microsecond=0))
    # -- check custom timestamp column, default value
    now = datetime.utcnow()
    store.put(df, 'mydata', append=False, timestamp='CREATED')
    df2 = store.get('mydata')
    _created = _stored_ts(df2, 'CREATED')
    self.assertEqual(_created.replace(second=0, microsecond=0),
                     now.replace(second=0, microsecond=0))
    # -- check custom timestamp column, value as tuple
    now = datetime.utcnow() - timedelta(days=1)
    store.put(df, 'mydata', append=False, timestamp=('CREATED', now))
    df2 = store.get('mydata')
    _created = _stored_ts(df2, 'CREATED')
    self.assertEqual(_created.replace(second=0, microsecond=0),
                     now.replace(second=0, microsecond=0))
    # -- check timestamp given as a plain datetime value
    # set a day in the past to avoid accidentally creating the current
    # datetime in mongo
    now = datetime.now() - timedelta(days=1)
    store.put(df, 'mydata', timestamp=now, append=False)
    df2 = store.get('mydata')
    # compare the data
    _created = _stored_ts(df2, '_created')
    self.assertEqual(_created.replace(microsecond=0),
                     now.replace(microsecond=0))
def test_store_dataframe_as_dfgroup_column_order(self):
    """groupby= stores a dataframe as a grouped collection queryable by kwargs.

    Renamed: this method previously shared its name with a later
    test_store_dataframe_as_dfgroup definition, which silently shadowed
    this one so it never ran. The rename makes both tests discoverable.
    """
    data = {
        'a': list(range(1, 10)),
        'b': list(range(1, 10))
    }
    result_data = {
        'a': list(range(1, 2)),
        'b': 1,
    }
    df = pd.DataFrame(data)
    result_df = pd.DataFrame(result_data)
    store = OmegaStore()
    groupby_columns = ['b']
    meta = store.put(df, 'dfgroup', groupby=groupby_columns)
    self.assertEqual(meta.kind, 'pandas.dfgroup')
    # make sure the collection is created
    # -- list_collection_names() replaces collection_names(), which was
    #    deprecated in pymongo 3.7 and removed in pymongo 4
    self.assertIn(
        'omegaml.dfgroup.datastore', store.mongodb.list_collection_names())
    # note column order can differ due to insertion order since pandas 0.25.1
    # hence using [] to ensure same column order for both expected, result
    df2 = store.get('dfgroup', kwargs={'b': 1})
    self.assertTrue(df2.equals(result_df[df2.columns]))
    df3 = store.get('dfgroup')
    self.assertTrue(df3.equals(df[df3.columns]))
    df4 = store.get('dfgroup', kwargs={'a': 1})
    self.assertTrue(df4.equals(result_df[df4.columns]))
def test_store_with_metadata(self):
    """attributes= is persisted on Metadata for dicts, dataframes and models."""
    om = OmegaStore(prefix='')
    attributes = {'foo': 'bar'}
    # -- plain python dict
    data = {
        'a': list(range(1, 10)),
        'b': list(range(1, 10))
    }
    meta = om.put(data, 'data', attributes=attributes)
    self.assertEqual(meta.kind, 'python.data')
    self.assertEqual(meta.attributes, attributes)
    data2 = om.get('data')
    self.assertEqual([data], data2)
    # -- dataframe
    df = pd.DataFrame(data)
    meta = om.put(df, 'datadf', attributes=attributes)
    self.assertEqual(meta.kind, 'pandas.dfrows')
    self.assertEqual(meta.attributes, attributes)
    df2 = om.get('datadf')
    assert_frame_equal(df, df2)
    # -- model
    lr = LogisticRegression(solver='liblinear', multi_class='auto')
    meta = om.put(lr, 'mymodel', attributes=attributes)
    self.assertEqual(meta.kind, 'sklearn.joblib')
    self.assertEqual(meta.attributes, attributes)
    lr2 = om.get('mymodel')
    self.assertIsInstance(lr2, LogisticRegression)
def test_migrate_unhashed_name(self):
    """Unhashed (legacy) datasets remain readable and can be migrated.

    Verifies retrieval still works after enabling hashed names, that
    migrate_unhashed_datasets() moves the data, and that a re-put uses
    the hashed collection name.
    """
    store = OmegaStore(bucket='foo', prefix='foo/')
    df = pd.DataFrame({'x': range(100)})
    long_name = 'a' * 10
    # (removed unused locals raised/error that were never referenced)
    # save as unhashed (old version)
    store.defaults.OMEGA_STORE_HASHEDNAMES = False
    meta_unhashed = store.put(df, long_name)
    # simulate upgrade, no migration
    store.defaults.OMEGA_STORE_HASHEDNAMES = True
    # check we can still retrieve
    dfx = store.get(long_name)
    assert_frame_equal(df, dfx)
    # migrate
    store.defaults.OMEGA_STORE_HASHEDNAMES = True
    migrate_unhashed_datasets(store)
    meta_migrated = store.metadata(long_name)
    # check we can still retrieve after migration
    dfx = store.get(long_name)
    assert_frame_equal(df, dfx)
    # stored hashed
    meta_hashed = store.put(df, long_name, append=False)
    # check migration worked as expected
    self.assertNotEqual(meta_unhashed.collection, meta_hashed.collection)
    self.assertEqual(meta_migrated.collection, meta_hashed.collection)
def test_put_dataframe_multiple(self):
    """Putting the same dataframe twice appends rows by default."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    stored = store.get('mydata')
    self.assertTrue(df.equals(stored), "expected dataframes to be equal")
    # putting again appends, doubling the row count
    store.put(df, 'mydata')
    stored = store.get('mydata')
    self.assertEqual(len(df) * 2, len(stored),
                     "expected dataframes to be equal")
def test_raw_files(self):
    """Raw file backend stores both file-like objects and filesystem paths."""
    import os
    import tempfile
    store = OmegaStore()
    store.register_backend(PythonRawFileBackend.KIND, PythonRawFileBackend)
    # test we can write from a file-like object
    data = "some data"
    file_like = BytesIO(data.encode('utf-8'))
    store.put(file_like, 'myfile')
    self.assertEqual(data.encode('utf-8'), store.get('myfile').read())
    # test we can write from an actual file
    # -- use a tempfile instead of a hard-coded /tmp path so the test is
    #    safe under concurrent runs and on platforms without /tmp, and the
    #    file is cleaned up afterwards
    data = "some other data"
    fd, path = tempfile.mkstemp(suffix='.txt')
    try:
        with os.fdopen(fd, 'wb') as fout:
            fout.write(data.encode('utf-8'))
        store.put(path, 'myfile')
        self.assertEqual(data.encode('utf-8'), store.get('myfile').read())
    finally:
        os.remove(path)
def test_store_irregular_column_names(self):
    """ test storing irregular column names

    Note: comparing pandas Index objects via assertEqual raises
    ValueError ("truth value ... is ambiguous") because `not idx1 == idx2`
    is evaluated on an array; use Index.equals instead.
    """
    df = pd.DataFrame({'x_1': range(10)})
    store = OmegaStore()
    store.put(df, 'foo', append=False)
    df2 = store.get('foo')
    self.assertTrue(df.columns.equals(df2.columns))
def test_existing_arbitrary_collection_flat(self):
    """An existing flat mongodb collection can be exposed as a rawdict dataset."""
    data = {'foo': 'bar', 'bax': 'fox'}
    store = OmegaStore()
    store.register_backend(PandasRawDictBackend.KIND, PandasRawDictBackend)
    foo_coll = store.mongodb['foo']
    # insert_one replaces Collection.insert, deprecated in pymongo 3.x
    # and removed in pymongo 4 (it also adds _id to `data`, as before)
    foo_coll.insert_one(data)
    store.make_metadata('myfoo', collection='foo', kind='pandas.rawdict').save()
    self.assertIn('myfoo', store.list())
    # test we get back _id column if raw=True
    data_ = store.get('myfoo', raw=True)
    assert_frame_equal(json_normalize(data), data_)
    # test we get just the data column
    data_ = store.get('myfoo', raw=False)
    cols = ['foo', 'bax']
    assert_frame_equal(json_normalize(data)[cols], data_[cols])
def test_get_dataframe_projected_mixin(self):
    """name[spec] supports single columns, ranges, ':' and '^' exclusion."""
    df = pd.DataFrame({
        'a': list(range(1, 10)),
        'b': list(range(1, 10)),
        'c': list(range(1, 10)),
    })
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    for spec in ('a', ':b', ':', 'b:', '^c'):
        # filter in mongodb
        df2 = store.get('mydata[{}]'.format(spec))
        # build the equivalent projection on the local dataframe
        if spec == ':':
            expected = df.loc[:, :]
        elif ':' in spec:
            start, end = spec.split(':')
            expected = df.loc[:, slice(start or None, end or None)]
        elif spec.startswith('^'):
            excluded = spec[1:].split(',')
            keep = [col for col in df.columns if col not in excluded]
            expected = df[keep]
        else:
            expected = df[[spec]]
        self.assertTrue(expected.equals(df2),
                        "expected dataframes to be equal")
def test_put_python_dict(self):
    """A python dict is stored and returned wrapped in a list."""
    # create some data
    data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    store = OmegaStore(prefix='')
    store.put(data, 'mydata')
    data2 = store.get('mydata')
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual([data], data2)
def test_lazy_unique(self):
    """ test getting a MDataFrame and unique values """
    data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    df = pd.DataFrame(data)
    store = OmegaStore()
    # put returns a Metadata object, which was assigned but never used
    store.put(df, 'foo', append=False)
    val = store.get('foo', lazy=True).a.unique().value
    self.assertListEqual(data['a'], list(val))
def test_store_dict_in_df(self):
    """A dataframe with dict-valued cells round-trips unchanged."""
    df = pd.DataFrame({'x': [{'foo': 'bar '}]})
    store = OmegaStore()
    store.put(df, 'test-dict', append=False)
    restored = store.get('test-dict')
    testing.assert_frame_equal(df, restored)
def test_arbitrary_collection_new(self):
    """An arbitrary pymongo Collection can be put directly into the store."""
    data = {'foo': 'bar', 'bax': 'fox'}
    store = OmegaStore()
    store.register_backend(PandasRawDictBackend.KIND, PandasRawDictBackend)
    # create the collection
    foo_coll = store.mongodb['foo']
    # insert_one replaces Collection.insert, deprecated in pymongo 3.x
    # and removed in pymongo 4 (it also adds _id to `data`, as before)
    foo_coll.insert_one(data)
    # store the collection as is
    store.put(foo_coll, 'myfoo').save()
    self.assertIn('myfoo', store.list())
    # test we get back _id column if raw=True
    data_ = store.get('myfoo', raw=True)
    assert_frame_equal(data_, json_normalize(data))
    # test we get just the data column
    data_ = store.get('myfoo', raw=False)
    cols = ['foo', 'bax']
    assert_frame_equal(data_[cols], json_normalize(data)[cols])
def test_store_series(self):
    """A pandas Series keeps its own (string) index on round-trip."""
    from string import ascii_lowercase
    index = list(ascii_lowercase[:10])
    series = pd.Series(range(10), index=index)
    store = OmegaStore()
    store.put(series, 'fooseries', append=False)
    restored = store.get('fooseries')
    assert_series_equal(series, restored)
def test_store_datetime(self):
    """ test storing naive datetimes """
    dates = pd.date_range(datetime(2016, 1, 1), datetime(2016, 1, 10))
    df = pd.DataFrame({'x': dates})
    store = OmegaStore()
    store.put(df, 'test-date', append=False)
    restored = store.get('test-date')
    assert_frame_equal(df, restored)
def test_store_tz_datetime(self):
    """ test storing timezoned datetimes """
    eastern = pd.date_range('2019-10-01', periods=5, tz='US/Eastern',
                            normalize=True)
    df = pd.DataFrame({'y': eastern})
    store = OmegaStore()
    store.put(df, 'test-date', append=False)
    restored = store.get('test-date')
    testing.assert_frame_equal(df, restored)
def test_get_forced_python(self):
    """this tests we can retrieve data as python values

    the purpose is to test the basic mode of OmegaStore in case
    pandas and scikit learn are not available
    """
    store = OmegaStore(prefix='')
    # -- pure python data comes back as-is
    data = {
        'a': list(range(1, 10)),
        'b': list(range(1, 10))
    }
    store.put(data, 'data')
    self.assertEqual(data, store.get('data', force_python=True))
    # -- a dataframe comes back as raw records incl. bookkeeping columns
    df = pd.DataFrame(data)
    store.put(df, 'mydata')
    raw = pd.DataFrame(store.get('mydata', force_python=True))
    def _is_om_col(col):
        # _id / _idx* / _om* are store-internal bookkeeping columns
        return col == '_id' or col.startswith('_idx') or col.startswith('_om')
    df2 = raw[[col for col in raw.columns if not _is_om_col(col)]]
    self.assertTrue(df.equals(df2), "expected dataframes to be equal")
    # -- a model comes back as its serialized artifact (a zipfile)
    iris = load_iris()
    lr = LogisticRegression(solver='liblinear', multi_class='auto')
    lr.fit(iris.data, iris.target)
    # store it remote
    store.put(lr, 'foo')
    # get it back as a zipfile
    contents = store.get('foo', force_python=True).read()
    with ZipFile(BytesIO(contents)) as zipf:
        self.assertIn('foo', zipf.namelist())
def test_get_dataframe_project(self):
    """columns= projects the stored dataframe server side."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    # filter in mongodb
    projected = store.get('mydata', columns=['a'])
    # same projection applied locally
    expected = df[['a']]
    self.assertTrue(expected.equals(projected),
                    "expected dataframes to be equal")
def test_get_dataframe_filter(self):
    """filter= applies mongodb-style query operators on get()."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    # filter in mongodb
    filtered = store.get('mydata', filter=dict(a__gt=1, a__lt=10))
    # same filter applied locally
    expected = df[(df.a > 1) & (df.a < 10)]
    self.assertTrue(expected.equals(filtered),
                    "expected dataframes to be equal")
def test_store_series_timeindex(self):
    """ test storing a pandas series with its own datetime index """
    # use datetime.datetime -- pd.datetime was deprecated in pandas 0.25
    # and removed in pandas 1.0
    from datetime import datetime
    series = pd.Series(range(10),
                       name='foo',
                       index=pd.date_range(datetime(2016, 1, 1),
                                           datetime(2016, 1, 10)))
    store = OmegaStore()
    store.put(series, 'fooseries', append=False)
    series2 = store.get('fooseries')
    assert_series_equal(series, series2)
def test_put_python_dict_multiple(self):
    """Putting the same dict twice stores two copies of it."""
    # create some data
    data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    store = OmegaStore(prefix='')
    store.put(data, 'mydata')
    store.put(data, 'mydata')
    data2 = store.get('mydata')
    # we will have stored the same object twice
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(data, data2[0])
    self.assertEqual(data, data2[1])
def test_put_dataframe_xtra_large(self):
    """A dataframe large enough to trigger the fast insert path round-trips."""
    # force fast insert
    n = int(1e4 + 1)
    df = pd.DataFrame({'a': list(range(n)), 'b': list(range(n))})
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    restored = store.get('mydata')
    self.assertTrue(df.equals(restored), "expected dataframes to be equal")
def test_store_dataframe_as_dfgroup(self):
    """groupby= stores a dataframe as a grouped collection queryable by kwargs."""
    data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
    result_data = {
        'a': list(range(1, 2)),
        'b': 1,
    }
    df = pd.DataFrame(data)
    result_df = pd.DataFrame(result_data)
    store = OmegaStore()
    groupby_columns = ['b']
    meta = store.put(df, 'dfgroup', groupby=groupby_columns)
    self.assertEqual(meta.kind, 'pandas.dfgroup')
    # make sure the collection is created
    # -- list_collection_names() replaces collection_names(), which was
    #    deprecated in pymongo 3.7 and removed in pymongo 4
    self.assertIn('omegaml.dfgroup.datastore',
                  store.mongodb.list_collection_names())
    df2 = store.get('dfgroup', kwargs={'b': 1})
    self.assertTrue(df2.equals(result_df))
    df3 = store.get('dfgroup')
    self.assertTrue(df3.equals(df))
    df4 = store.get('dfgroup', kwargs={'a': 1})
    self.assertTrue(df4.equals(result_df))
def test_store_dataframe_as_hdf(self):
    """as_hdf=True stores a dataframe as an hdf file in gridfs."""
    df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
    store = OmegaStore()
    meta = store.put(df, 'foo', as_hdf=True)
    self.assertEqual(meta.kind, 'pandas.hdf')
    # the hdf file must actually exist in gridfs
    meta = store.metadata('foo')
    self.assertIn(meta.gridfile.name, store.fs.list())
    restored = store.get('foo')
    self.assertTrue(df.equals(restored), "dataframes differ")
    # a dataset whose gridfs file was deleted raises on get
    meta = store.put(restored, 'foo_will_be_removed', as_hdf=True)
    meta = store.metadata('foo_will_be_removed')
    file_id = store.fs.get_last_version(meta.gridfile.name)._id
    store.fs.delete(file_id)
    store2 = OmegaStore()
    with self.assertRaises(gridfs.errors.NoFile):
        store2.get('foo_will_be_removed')
    # and the hdf file is indeed gone
    self.assertNotIn('hdfdf.hdf', store2.fs.list())
def test_migrate_unhashed_name_hdf(self):
    """Unhashed (legacy) hdf datasets remain readable; a re-put hashes the name."""
    store = OmegaStore(bucket='foo', prefix='foo/')
    df = pd.DataFrame({'x': range(100)})
    long_name = 'a' * 10
    # (removed unused locals raised/error that were never referenced)
    # save as unhashed (old version)
    store.defaults.OMEGA_STORE_HASHEDNAMES = False
    store.put(df, long_name, as_hdf=True)
    meta_unhashed = store.metadata(long_name)
    # retrieve should still work
    store.defaults.OMEGA_STORE_HASHEDNAMES = True
    dfx = store.get(long_name)
    assert_frame_equal(df, dfx)
    # stored hashed
    store.put(df, long_name, replace=True, as_hdf=True)
    meta_hashed = store.metadata(long_name)
    dfx = store.get(long_name)
    assert_frame_equal(df, dfx)
    # check hashing actually worked
    self.assertNotEqual(meta_unhashed.gridfile.name, meta_hashed.gridfile.name)
def test_put_dataframe_timeseries(self):
    """A DatetimeIndex round-trips and creates the expected mongodb index."""
    # use datetime.datetime -- pd.datetime was deprecated in pandas 0.25
    # and removed in pandas 1.0
    from datetime import datetime
    tsidx = pd.date_range(datetime(2016, 1, 1), datetime(2016, 4, 1))
    df = pd.DataFrame({
        'a': list(range(0, len(tsidx))),
        'b': list(range(0, len(tsidx)))
    }, index=tsidx)
    store = OmegaStore(prefix='')
    store.put(df, 'mydata')
    dfx = store.get('mydata')
    assert_frame_equal(df, dfx)
    # the index column must be indexed in mongodb
    idxs = list(store.collection('mydata').list_indexes())
    idx_names = [dict(v).get('name') for v in idxs]
    self.assertIn('asc__idx#0_0', idx_names)
def test_store_tz_datetime_dst(self): """ test storing timezoned datetimes """ # 2019 11 03 02:00 is the end of US DST https://www.timeanddate.com/time/dst/2019.html # pymongo will transform the object into a naive dt at UTC time at +3h (arguably incorrectly so) # while pandas creates the Timestamp as UTC -4 (as the day starts at 00:00, not 02:00). # On rendering back to a tz-aware datetime, this yields the wrong date (1 day eaerlier) because # pandas applies -4 on converting from UTC to US/Eastern (correctly). df = pd.DataFrame({ 'y': pd.date_range('2019-11-01', periods=5, tz='US/Eastern', normalize=True) }) store = OmegaStore() store.put(df, 'test-date', append=False) df2 = store.get('test-date') # currently this fails, see @skip reason testing.assert_frame_equal(df, df2)
def test_put_dataframe_multiindex(self):
    """A MultiIndex dataframe round-trips and creates the compound mongo index."""
    store = OmegaStore(prefix='')
    # codes= replaces the labels= keyword, which was deprecated in
    # pandas 0.24 and removed in pandas 1.0
    midx = pd.MultiIndex(levels=[[u'bar', u'baz', u'foo', u'qux'],
                                 [u'one', u'two']],
                         codes=[[0, 0, 1, 1, 2, 2, 3, 3],
                                [0, 1, 0, 1, 0, 1, 0, 1]],
                         names=[u'first', u'second'])
    df = pd.DataFrame({'x': range(0, len(midx))}, index=midx)
    store.put(df, 'mydata')
    dfx = store.get('mydata')
    assert_frame_equal(df, dfx)
    # both index levels must be indexed in mongodb
    idxs = list(store.collection('mydata').list_indexes())
    idx_names = [dict(v).get('name') for v in idxs]
    self.assertIn('asc__idx#0_first__asc__idx#1_second', idx_names)