Ejemplo n.º 1
0
 def test_bucket(self):
     """Verify two OmegaStore buckets keep same-named objects separate.

     Stores a dict under 'data' and a file under 'myfile' in two
     different buckets and checks each bucket returns its own copy.
     """
     # test different buckets actually separate objects by the same name
     # -- data
     foo_store = OmegaStore(bucket='foo')
     bar_store = OmegaStore(bucket='bar')
     # the raw-file backend must be registered on each store separately
     foo_store.register_backend(PythonRawFileBackend.KIND,
                                PythonRawFileBackend)
     bar_store.register_backend(PythonRawFileBackend.KIND,
                                PythonRawFileBackend)
     foo_data = {'foo': 'bar', 'bax': 'fox'}
     bar_data = {'foo': 'bax', 'bax': 'foz'}
     foo_store.put(foo_data, 'data')
     bar_store.put(bar_data, 'data')
     # get() returns python dicts wrapped in a list, hence [0]
     self.assertEqual(foo_store.get('data')[0], foo_data)
     self.assertEqual(bar_store.get('data')[0], bar_data)
     # -- files
     foo_data = "some data"
     file_like = BytesIO(foo_data.encode('utf-8'))
     foo_store.put(file_like, 'myfile')
     bar_data = "some other data"
     file_like = BytesIO(bar_data.encode('utf-8'))
     bar_store.put(file_like, 'myfile')
     # same name, different buckets => different contents
     self.assertNotEqual(
         foo_store.get('myfile').read(),
         bar_store.get('myfile').read())
Ejemplo n.º 2
0
 def test_get_dataframe_colspec_opspec(self):
     """Test the name[columns]#operator:kwargs qualifier syntax of get().

     'mydata[a,b]#op:kw=v' selects columns a,b and applies operator op
     (default: iterchunks), returning a chunk iterator.
     """
     # create some dataframe
     df = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
         'c': list(range(1, 10)),
     })
     store = OmegaStore(prefix='')
     store.put(df, 'mydata')
     # check we can specify [] and # qualifiers
     value = store.get('mydata[a]#')
     self.assertTrue(hasattr(value, '__next__'))
     nvalue = next(value)
     self.assertEqual(len(nvalue), len(df))
     assert_frame_equal(nvalue, df[['a']])
     # check we can specify a specific operator
     value = store.get('mydata[a,b]#iterchunks')
     # assert the iterator protocol before consuming, consistent with
     # the first case above
     self.assertTrue(hasattr(value, '__next__'))
     nvalue = next(value)
     self.assertEqual(len(nvalue), len(df))
     assert_frame_equal(nvalue, df[['a', 'b']])
     # check we can specify kwargs
     value = store.get('mydata[a,b]#iterchunks:chunksize=1')
     self.assertTrue(hasattr(value, '__next__'))
     nvalue = next(value)
     # chunksize=1 yields one row per chunk
     self.assertEqual(len(nvalue), 1)
     assert_frame_equal(nvalue, df[['a', 'b']].iloc[0:1])
Ejemplo n.º 3
0
 def test_put_dataframe_timestamp(self):
     """Test the timestamp= option of put() in all supported forms.

     timestamp=True adds a '_created' column, timestamp='NAME' uses a
     custom column name, timestamp=('NAME', value) additionally sets an
     explicit value, and timestamp=<datetime> sets '_created' to that
     value.
     """
     # create some dataframe
     from datetime import datetime
     df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
     store = OmegaStore(prefix='')
     # -- check default timestamp
     now = datetime.utcnow()
     store.put(df, 'mydata', append=False, timestamp=True)
     df2 = store.get('mydata')
     _created = df2['_created'].astype(datetime).unique()[0].to_pydatetime()
     # seconds/microseconds zeroed: minute-precision comparison
     self.assertEqual(_created.replace(second=0, microsecond=0),
                      now.replace(second=0, microsecond=0))
     # -- check custom timestamp column, default value
     now = datetime.utcnow()
     store.put(df, 'mydata', append=False, timestamp='CREATED')
     df2 = store.get('mydata')
     _created = df2['CREATED'].astype(datetime).unique()[0].to_pydatetime()
     self.assertEqual(_created.replace(second=0, microsecond=0),
                      now.replace(second=0, microsecond=0))
     # -- check custom timestamp column, value as tuple
     now = datetime.utcnow() - timedelta(days=1)
     store.put(df, 'mydata', append=False, timestamp=('CREATED', now))
     df2 = store.get('mydata')
     _created = df2['CREATED'].astype(datetime).unique()[0].to_pydatetime()
     self.assertEqual(_created.replace(second=0, microsecond=0),
                      now.replace(second=0, microsecond=0))
     # -- check timestamp given as a plain datetime value
     # set a day in the past to avoid accidentally creating the current
     # datetime in mongo
     now = datetime.now() - timedelta(days=1)
     store.put(df, 'mydata', timestamp=now, append=False)
     df2 = store.get('mydata')
     # compare the data
     _created = df2['_created'].astype(datetime).unique()[0].to_pydatetime()
     self.assertEqual(_created.replace(microsecond=0),
                      now.replace(microsecond=0))
Ejemplo n.º 4
0
 def test_store_dataframe_as_dfgroup(self):
     """Test storing a dataframe grouped by columns (pandas.dfgroup)
     and retrieving it filtered by group values via kwargs.
     """
     data = {
         'a': list(range(1, 10)),
         'b': list(range(1, 10))
     }
     # expected result when filtering on b == 1 (or a == 1)
     result_data = {
         'a': list(range(1, 2)),
         'b': 1,
     }
     df = pd.DataFrame(data)
     result_df = pd.DataFrame(result_data)
     store = OmegaStore()
     groupby_columns = ['b']
     meta = store.put(df, 'dfgroup', groupby=groupby_columns)
     self.assertEqual(meta.kind, 'pandas.dfgroup')
     # make sure the collection is created
     self.assertIn(
         'omegaml.dfgroup.datastore', store.mongodb.collection_names())
     # note column order can differ due to insertion order since pandas 0.25.1
     # hence using [] to ensure same column order for both expected, result
     df2 = store.get('dfgroup', kwargs={'b': 1})
     self.assertTrue(df2.equals(result_df[df2.columns]))
     # without kwargs the full dataframe is returned
     df3 = store.get('dfgroup')
     self.assertTrue(df3.equals(df[df3.columns]))
     # filtering on a non-grouped column also works
     df4 = store.get('dfgroup', kwargs={'a': 1})
     self.assertTrue(df4.equals(result_df[df4.columns]))
Ejemplo n.º 5
0
 def test_store_with_metadata(self):
     """Test put() stores custom attributes in Metadata for all kinds
     exercised here (python.data, pandas.dfrows, sklearn.joblib).
     """
     om = OmegaStore(prefix='')
     # dict
     data = {
         'a': list(range(1, 10)),
         'b': list(range(1, 10))
     }
     attributes = {'foo': 'bar'}
     meta = om.put(data, 'data', attributes=attributes)
     self.assertEqual(meta.kind, 'python.data')
     self.assertEqual(meta.attributes, attributes)
     data2 = om.get('data')
     # python.data objects are returned wrapped in a list
     self.assertEqual([data], data2)
     # dataframe
     df = pd.DataFrame(data)
     meta = om.put(df, 'datadf', attributes=attributes)
     self.assertEqual(meta.kind, 'pandas.dfrows')
     self.assertEqual(meta.attributes, attributes)
     df2 = om.get('datadf')
     assert_frame_equal(df, df2)
     # model
     lr = LogisticRegression(solver='liblinear', multi_class='auto')
     meta = om.put(lr, 'mymodel', attributes=attributes)
     self.assertEqual(meta.kind, 'sklearn.joblib')
     self.assertEqual(meta.attributes, attributes)
     lr2 = om.get('mymodel')
     self.assertIsInstance(lr2, LogisticRegression)
Ejemplo n.º 6
0
 def test_migrate_unhashed_name(self):
     """Test datasets stored with unhashed collection names survive the
     switch to OMEGA_STORE_HASHEDNAMES and migrate_unhashed_datasets().
     """
     store = OmegaStore(bucket='foo', prefix='foo/')
     df = pd.DataFrame({'x': range(100)})
     long_name = 'a' * 10
     # save as unhashed (old version)
     store.defaults.OMEGA_STORE_HASHEDNAMES = False
     meta_unhashed = store.put(df, long_name)
     # simulate upgrade, no migration
     store.defaults.OMEGA_STORE_HASHEDNAMES = True
     # check we can still retrieve
     dfx = store.get(long_name)
     assert_frame_equal(df, dfx)
     # migrate
     store.defaults.OMEGA_STORE_HASHEDNAMES = True
     migrate_unhashed_datasets(store)
     meta_migrated = store.metadata(long_name)
     # check we can still retrieve after migration
     dfx = store.get(long_name)
     assert_frame_equal(df, dfx)
     # store hashed
     meta_hashed = store.put(df, long_name, append=False)
     # check migration worked as expected: the migrated dataset now lives
     # in the same (hashed) collection a fresh put() would use
     self.assertNotEqual(meta_unhashed.collection, meta_hashed.collection)
     self.assertEqual(meta_migrated.collection, meta_hashed.collection)
Ejemplo n.º 7
0
 def test_put_dataframe_multiple(self):
     """Test that put() appends rows when storing to an existing name."""
     # create some dataframe
     df = pd.DataFrame({'a': list(range(1, 10)), 'b': list(range(1, 10))})
     store = OmegaStore(prefix='')
     store.put(df, 'mydata')
     df2 = store.get('mydata')
     self.assertTrue(df.equals(df2), "expected dataframes to be equal")
     # add again - by default put() appends to the existing dataset
     store.put(df, 'mydata')
     df2 = store.get('mydata')
     # fix: message now matches the assertion (row count, not equality)
     self.assertEqual(
         len(df) * 2, len(df2),
         "expected appended dataframe to have twice the rows")
Ejemplo n.º 8
0
 def test_raw_files(self):
     """Test storing raw files from a file-like object and a file path.

     Uses a tempfile instead of a hard-coded /tmp path so the test is
     portable and does not clash with concurrent runs.
     """
     import os
     import tempfile
     store = OmegaStore()
     store.register_backend(PythonRawFileBackend.KIND, PythonRawFileBackend)
     # test we can write from a file-like object
     data = "some data"
     file_like = BytesIO(data.encode('utf-8'))
     store.put(file_like, 'myfile')
     self.assertEqual(data.encode('utf-8'), store.get('myfile').read())
     # test we can write from an actual file
     data = "some other data"
     fd, path = tempfile.mkstemp(suffix='.txt')
     try:
         with os.fdopen(fd, 'wb') as fout:
             fout.write(data.encode('utf-8'))
         store.put(path, 'myfile')
         self.assertEqual(data.encode('utf-8'), store.get('myfile').read())
     finally:
         # always remove the temp file, even if assertions fail
         os.remove(path)
Ejemplo n.º 9
0
 def test_store_irregular_column_names(self):
     """ test storing irregular column names """
     df = pd.DataFrame({'x_1': range(10)})
     store = OmegaStore()
     store.put(df, 'foo', append=False)
     df2 = store.get('foo')
     # compare as lists -- assertEqual on pd.Index objects relies on the
     # truthiness of an element-wise comparison, which raises for more
     # than one column; list comparison is unambiguous
     self.assertEqual(list(df.columns), list(df2.columns))
Ejemplo n.º 10
0
 def test_existing_arbitrary_collection_flat(self):
     """Test linking an existing mongodb collection as a pandas.rawdict
     dataset via make_metadata().
     """
     data = {'foo': 'bar', 'bax': 'fox'}
     store = OmegaStore()
     store.register_backend(PandasRawDictBackend.KIND, PandasRawDictBackend)
     # create a collection outside of the store's control
     foo_coll = store.mongodb['foo']
     # NOTE(review): Collection.insert is deprecated in pymongo 3.x in
     # favor of insert_one -- confirm against the project's pinned pymongo
     foo_coll.insert(data)
     # link the existing collection under the store name 'myfoo'
     store.make_metadata('myfoo', collection='foo',
                         kind='pandas.rawdict').save()
     self.assertIn('myfoo', store.list())
     # test we get back _id column if raw=True
     data_ = store.get('myfoo', raw=True)
     assert_frame_equal(json_normalize(data), data_)
     # test we get just the data column
     data_ = store.get('myfoo', raw=False)
     cols = ['foo', 'bax']
     assert_frame_equal(json_normalize(data)[cols], data_[cols])
Ejemplo n.º 11
0
 def test_get_dataframe_projected_mixin(self):
     """Test column projection via the name[spec] syntax against the
     equivalent projection computed on the local dataframe.

     Covers single column, open/closed column slices, the full slice
     and the exclusion (^) form.
     """
     # a dataframe with three columns to project from
     df = pd.DataFrame({
         'a': list(range(1, 10)),
         'b': list(range(1, 10)),
         'c': list(range(1, 10)),
     })
     store = OmegaStore(prefix='')
     store.put(df, 'mydata')
     for spec in ('a', ':b', ':', 'b:', '^c'):
         df2 = store.get('mydata[{}]'.format(spec))
         # compute the equivalent projection locally
         if spec == ':':
             expected = df.loc[:, :]
         elif ':' in spec:
             start, _, stop = spec.partition(':')
             expected = df.loc[:, slice(start or None, stop or None)]
         elif spec.startswith('^'):
             excluded = spec[1:].split(',')
             keep = [col for col in df.columns if col not in excluded]
             expected = df[keep]
         else:
             expected = df[[spec]]
         self.assertTrue(expected.equals(df2),
                         "expected dataframes to be equal")
Ejemplo n.º 12
0
 def test_put_python_dict(self):
     """Test storing and retrieving a plain python dict."""
     # create some data
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     store = OmegaStore(prefix='')
     store.put(data, 'mydata')
     data2 = store.get('mydata')
     # get() returns stored python objects as a list
     # use assertEqual -- assertEquals is deprecated (removed in py3.12)
     self.assertEqual([data], data2)
Ejemplo n.º 13
0
 def test_lazy_unique(self):
     """ test getting a MDataFrame and unique values """
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     df = pd.DataFrame(data)
     store = OmegaStore()
     # note the returned Metadata is not needed here (unused local removed)
     store.put(df, 'foo', append=False)
     # lazy=True returns an MDataFrame; .value evaluates the query
     val = store.get('foo', lazy=True).a.unique().value
     self.assertListEqual(data['a'], list(val))
Ejemplo n.º 14
0
 def test_store_dict_in_df(self):
     """Verify a dataframe with dict-valued cells round-trips intact."""
     original = pd.DataFrame({'x': [{'foo': 'bar '}]})
     store = OmegaStore()
     store.put(original, 'test-dict', append=False)
     restored = store.get('test-dict')
     testing.assert_frame_equal(original, restored)
Ejemplo n.º 15
0
 def test_arbitrary_collection_new(self):
     """Test storing an existing mongodb collection object directly,
     creating a pandas.rawdict dataset.
     """
     data = {'foo': 'bar', 'bax': 'fox'}
     store = OmegaStore()
     store.register_backend(PandasRawDictBackend.KIND, PandasRawDictBackend)
     # create the collection
     foo_coll = store.mongodb['foo']
     # NOTE(review): Collection.insert is deprecated in pymongo 3.x in
     # favor of insert_one -- confirm against the project's pinned pymongo
     foo_coll.insert(data)
     # store the collection as is
     store.put(foo_coll, 'myfoo').save()
     self.assertIn('myfoo', store.list())
     # test we get back _id column if raw=True
     data_ = store.get('myfoo', raw=True)
     assert_frame_equal(data_, json_normalize(data))
     # test we get just the data column
     data_ = store.get('myfoo', raw=False)
     cols = ['foo', 'bax']
     assert_frame_equal(data_[cols], json_normalize(data)[cols])
Ejemplo n.º 16
0
 def test_store_series(self):
     """ test storing a pandas series with it's own index """
     from string import ascii_lowercase
     # a series indexed by the letters a..j
     index_letters = list(ascii_lowercase[:10])
     original = pd.Series(range(10), index=index_letters)
     store = OmegaStore()
     store.put(original, 'fooseries', append=False)
     restored = store.get('fooseries')
     assert_series_equal(original, restored)
Ejemplo n.º 17
0
 def test_store_datetime(self):
     """ test storing naive datetimes """
     # a dataframe with a naive (tz-less) datetime column
     dates = pd.date_range(datetime(2016, 1, 1), datetime(2016, 1, 10))
     original = pd.DataFrame({'x': dates})
     store = OmegaStore()
     store.put(original, 'test-date', append=False)
     restored = store.get('test-date')
     assert_frame_equal(original, restored)
Ejemplo n.º 18
0
 def test_store_tz_datetime(self):
     """ test storing timezoned datetimes """
     # normalized tz-aware dates in US/Eastern
     dates = pd.date_range('2019-10-01', periods=5, tz='US/Eastern',
                           normalize=True)
     original = pd.DataFrame({'y': dates})
     store = OmegaStore()
     store.put(original, 'test-date', append=False)
     restored = store.get('test-date')
     testing.assert_frame_equal(original, restored)
Ejemplo n.º 19
0
    def test_get_forced_python(self):
        """
        this tests we can retrieve data as python values

        the purpose is to test the basic mode of OmegaStore in
        case pandas and scikit learn are not available
        """
        store = OmegaStore(prefix='')
        # pure data
        data = {
            'a': list(range(1, 10)),
            'b': list(range(1, 10))
        }
        meta = store.put(data, 'data')
        data2 = store.get('data', force_python=True)
        self.assertEqual(data, data2)
        # dataframe
        # create some dataframe
        df = pd.DataFrame({
            'a': list(range(1, 10)),
            'b': list(range(1, 10))
        })
        store.put(df, 'mydata')
        df2 = store.get('mydata', force_python=True)
        df2 = pd.DataFrame(df2)
        # drop the store's internal columns (_id, _idx*, _om*) before
        # comparing to the original dataframe
        real_cols = [col for col in df2.columns
                     if (col != '_id'
                         and not col.startswith('_idx')
                         and not col.startswith('_om'))]
        df2 = df2[real_cols]
        self.assertTrue(df.equals(df2), "expected dataframes to be equal")
        # model
        iris = load_iris()
        X = iris.data
        Y = iris.target
        lr = LogisticRegression(solver='liblinear', multi_class='auto')
        lr.fit(X, Y)
        # store it remote
        store.put(lr, 'foo')
        # get it back as a zipfile
        lr2file = store.get('foo', force_python=True)
        contents = lr2file.read()
        # force_python returns the serialized model as a zip archive
        # containing an entry named after the dataset
        with ZipFile(BytesIO(contents)) as zipf:
            self.assertIn('foo', zipf.namelist())
Ejemplo n.º 20
0
 def test_get_dataframe_project(self):
     """Test server-side column projection via the columns= kwarg."""
     original = pd.DataFrame({'a': list(range(1, 10)),
                              'b': list(range(1, 10))})
     store = OmegaStore(prefix='')
     store.put(original, 'mydata')
     # project to column 'a' in mongodb
     projected = store.get('mydata', columns=['a'])
     # compare to the equivalent local projection
     expected = original[['a']]
     self.assertTrue(expected.equals(projected),
                     "expected dataframes to be equal")
Ejemplo n.º 21
0
 def test_get_dataframe_filter(self):
     """Test server-side row filtering via the filter= kwarg."""
     original = pd.DataFrame({'a': list(range(1, 10)),
                              'b': list(range(1, 10))})
     store = OmegaStore(prefix='')
     store.put(original, 'mydata')
     # filter in mongodb: 1 < a < 10
     filtered = store.get('mydata', filter=dict(a__gt=1, a__lt=10))
     # apply the equivalent filter locally
     expected = original[(original.a > 1) & (original.a < 10)]
     self.assertTrue(expected.equals(filtered),
                     "expected dataframes to be equal")
Ejemplo n.º 22
0
 def test_store_series_timeindex(self):
     """ test storing a pandas series with it's own index """
     from datetime import datetime
     # use datetime.datetime -- pd.datetime was deprecated and removed
     # in pandas 1.0
     series = pd.Series(range(10),
                        name='foo',
                        index=pd.date_range(datetime(2016, 1, 1),
                                            datetime(2016, 1, 10)))
     store = OmegaStore()
     store.put(series, 'fooseries', append=False)
     series2 = store.get('fooseries')
     assert_series_equal(series, series2)
Ejemplo n.º 23
0
 def test_put_python_dict_multiple(self):
     """Test put() appends when storing a dict under an existing name."""
     # create some data
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     store = OmegaStore(prefix='')
     store.put(data, 'mydata')
     store.put(data, 'mydata')
     data2 = store.get('mydata')
     # we will have stored the same object twice
     # use assertEqual -- assertEquals is deprecated (removed in py3.12)
     self.assertEqual(data, data2[0])
     self.assertEqual(data, data2[1])
Ejemplo n.º 24
0
 def test_put_dataframe_xtra_large(self):
     """Test a dataframe of more than 10k rows round-trips intact.

     The size is chosen to force the store's fast insert path.
     """
     nrows = int(1e4 + 1)
     original = pd.DataFrame({'a': list(range(nrows)),
                              'b': list(range(nrows))})
     store = OmegaStore(prefix='')
     store.put(original, 'mydata')
     restored = store.get('mydata')
     self.assertTrue(original.equals(restored),
                     "expected dataframes to be equal")
Ejemplo n.º 25
0
 def test_store_dataframe_as_dfgroup(self):
     """Test storing a grouped dataframe (pandas.dfgroup) and retrieving
     it filtered by group values via kwargs.
     """
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     # expected result when filtering on b == 1 (or a == 1)
     result_data = {
         'a': list(range(1, 2)),
         'b': 1,
     }
     df = pd.DataFrame(data)
     result_df = pd.DataFrame(result_data)
     store = OmegaStore()
     groupby_columns = ['b']
     meta = store.put(df, 'dfgroup', groupby=groupby_columns)
     self.assertEqual(meta.kind, 'pandas.dfgroup')
     # make sure the collection is created
     self.assertIn('omegaml.dfgroup.datastore',
                   store.mongodb.collection_names())
     # NOTE(review): unlike the other dfgroup test these comparisons rely
     # on identical column order between stored and expected dataframes
     df2 = store.get('dfgroup', kwargs={'b': 1})
     self.assertTrue(df2.equals(result_df))
     # without kwargs the full dataframe is returned
     df3 = store.get('dfgroup')
     self.assertTrue(df3.equals(df))
     # filtering on a non-grouped column also works
     df4 = store.get('dfgroup', kwargs={'a': 1})
     self.assertTrue(df4.equals(result_df))
Ejemplo n.º 26
0
 def test_store_dataframe_as_hdf(self):
     """Test storing a dataframe as an HDF file in gridfs (pandas.hdf)
     and that a missing gridfs file raises gridfs.errors.NoFile.
     """
     data = {'a': list(range(1, 10)), 'b': list(range(1, 10))}
     df = pd.DataFrame(data)
     store = OmegaStore()
     meta = store.put(df, 'foo', as_hdf=True)
     self.assertEqual(meta.kind, 'pandas.hdf')
     # make sure the hdf file is actually there
     meta = store.metadata('foo')
     self.assertIn(meta.gridfile.name, store.fs.list())
     df2 = store.get('foo')
     self.assertTrue(df.equals(df2), "dataframes differ")
     # test for non-existent file raises exception
     meta = store.put(df2, 'foo_will_be_removed', as_hdf=True)
     meta = store.metadata('foo_will_be_removed')
     # remove the gridfs file behind the dataset's back
     file_id = store.fs.get_last_version(meta.gridfile.name)._id
     store.fs.delete(file_id)
     store2 = OmegaStore()
     with self.assertRaises(gridfs.errors.NoFile):
         store2.get('foo_will_be_removed')
     # test hdf file is not there
     self.assertNotIn('hdfdf.hdf', store2.fs.list())
Ejemplo n.º 27
0
 def test_migrate_unhashed_name_hdf(self):
     """Test hdf datasets stored with unhashed gridfs names can still be
     retrieved after enabling OMEGA_STORE_HASHEDNAMES, and that replace
     stores under the hashed name.
     """
     store = OmegaStore(bucket='foo', prefix='foo/')
     df = pd.DataFrame({'x': range(100)})
     long_name = 'a' * 10
     # save as unhashed (old version)
     store.defaults.OMEGA_STORE_HASHEDNAMES = False
     store.put(df, long_name, as_hdf=True)
     meta_unhashed = store.metadata(long_name)
     # retrieve should still work after enabling hashed names
     store.defaults.OMEGA_STORE_HASHEDNAMES = True
     dfx = store.get(long_name)
     assert_frame_equal(df, dfx)
     # store hashed
     store.put(df, long_name, replace=True, as_hdf=True)
     meta_hashed = store.metadata(long_name)
     dfx = store.get(long_name)
     assert_frame_equal(df, dfx)
     # check hashing actually worked
     self.assertNotEqual(meta_unhashed.gridfile.name,
                         meta_hashed.gridfile.name)
Ejemplo n.º 28
0
 def test_put_dataframe_timeseries(self):
     """Test a timeseries-indexed dataframe round-trips and the index
     column is indexed in mongodb.
     """
     from datetime import datetime
     # create some dataframe
     # use datetime.datetime -- pd.datetime was deprecated and removed
     # in pandas 1.0
     tsidx = pd.date_range(datetime(2016, 1, 1), datetime(2016, 4, 1))
     df = pd.DataFrame({
         'a': list(range(0, len(tsidx))),
         'b': list(range(0, len(tsidx)))
     }, index=tsidx)
     store = OmegaStore(prefix='')
     store.put(df, 'mydata')
     dfx = store.get('mydata')
     assert_frame_equal(df, dfx)
     # the stored index column should have a mongodb index
     idxs = list(store.collection('mydata').list_indexes())
     idx_names = [dict(v).get('name') for v in idxs]
     self.assertIn('asc__idx#0_0', idx_names)
Ejemplo n.º 29
0
 def test_store_tz_datetime_dst(self):
     """ test storing timezoned datetimes """
     # 2019 11 03 02:00 is the end of US DST https://www.timeanddate.com/time/dst/2019.html
     # pymongo will transform the object into a naive dt at UTC time at +3h (arguably incorrectly so)
     # while pandas creates the Timestamp as UTC -4 (as the day starts at 00:00, not 02:00).
     # On rendering back to a tz-aware datetime, this yields the wrong date (1 day earlier) because
     # pandas applies -4 on converting from UTC to US/Eastern (correctly).
     df = pd.DataFrame({
         'y': pd.date_range('2019-11-01', periods=5, tz='US/Eastern', normalize=True)
     })
     store = OmegaStore()
     store.put(df, 'test-date', append=False)
     df2 = store.get('test-date')
     # currently this fails, see @skip reason
     testing.assert_frame_equal(df, df2)
Ejemplo n.º 30
0
 def test_put_dataframe_multiindex(self):
     """Test a multiindex dataframe round-trips and a compound index is
     created in mongodb.
     """
     # create some dataframe
     store = OmegaStore(prefix='')
     # build the full-product index via from_product -- the labels= kwarg
     # used previously was renamed to codes= in pandas 0.24 and removed
     # in pandas 1.0; from_product yields the identical index
     midx = pd.MultiIndex.from_product([[u'bar', u'baz', u'foo', u'qux'],
                                        [u'one', u'two']],
                                       names=[u'first', u'second'])
     df = pd.DataFrame({'x': range(0, len(midx))}, index=midx)
     store.put(df, 'mydata')
     dfx = store.get('mydata')
     assert_frame_equal(df, dfx)
     # both index levels should be part of the mongodb index
     idxs = list(store.collection('mydata').list_indexes())
     idx_names = [dict(v).get('name') for v in idxs]
     self.assertIn('asc__idx#0_first__asc__idx#1_second', idx_names)