def test_aggregate(self):
    """Summing ``x`` per ``x`` group equals the pandas result with ``x`` renamed to ``x_sum``."""
    actual = MDataFrame(self.coll).groupby(['x']).agg({'x': 'sum'})
    # pandas keeps the column name 'x'; the MDataFrame result is compared
    # against the frame with that column renamed to 'x_sum'
    expected = (
        self.df
        .groupby('x')
        .agg({'x': 'sum'})
        .rename(columns={'x': 'x_sum'})
    )
    self.assertTrue(actual.equals(expected))
def test_mdataframe(self):
    """An MDataFrame has the same columns, values and shape as the reference frame."""
    expected = self.df
    lazy = MDataFrame(self.coll)
    materialized = lazy.value
    # column order is not checked here, only the set of names
    self.assertEqual(set(MDataFrame(self.coll).columns), set(expected.columns))
    self.assertTrue(materialized.equals(expected))
    self.assertEqual(lazy.shape, expected.shape)
def test_count_multi_columns(self):
    """Grouped counts equal the pandas counts after a constant ``z`` column is added."""
    reference = self.df
    lazy = MDataFrame(self.coll)
    # add the same constant column on both sides
    lazy['z'] = 5
    reference['z'] = 5
    # group by and count
    actual = lazy.groupby(['x']).count()
    expected = reference.groupby('x').count()
    self.assertTrue(expected.equals(actual))
def test_mdataframe_xlarge(self):
    """Store a 10,001-row frame as 'mydata-xlarge' and read it back unchanged."""
    n_rows = int(1e4 + 1)
    frame = pd.DataFrame({
        'a': list(range(0, n_rows)),
        'b': list(range(0, n_rows)),
    })
    datasets = self.om.datasets
    datasets.put(frame, 'mydata-xlarge', append=False)
    collection = datasets.collection('mydata-xlarge')
    loaded = MDataFrame(collection).value
    self.assertEqual(set(MDataFrame(collection).columns), set(frame.columns))
    self.assertTrue(loaded.equals(frame))
def test_verylarge_dataframe(self):
    """Store a 10-million-row frame and read it back unchanged (opt-in).

    The test only runs when the ``TEST_LARGE`` environment variable is set
    to a non-empty value; otherwise it is reported as skipped.

    Previously the large frame was built but never stored or compared, so
    the test merely re-checked the small default fixture. It now follows
    the same store round-trip as ``test_mdataframe_xlarge``.
    """
    if not os.environ.get('TEST_LARGE'):
        # report a skip instead of a silent pass
        self.skipTest('set TEST_LARGE to run the very large dataframe test')
    n_rows = int(10e6)
    df = pd.DataFrame({
        'x': list(range(0, n_rows)),
        'y': list(range(0, n_rows)),
        'z': list(range(0, n_rows)),
    })
    store = self.om.datasets
    store.put(df, 'mydata-verylarge', append=False)
    coll = store.collection('mydata-verylarge')
    result = MDataFrame(coll).value
    self.assertEqual(set(MDataFrame(coll).columns), set(df.columns))
    self.assertTrue(result.equals(df))
def test_mdataframe_merge(self):
    """A left merge of two collections on ``x`` equals the pandas left merge."""
    om = self.om
    right = pd.DataFrame({
        column: list(range(0, 20)) for column in ('x', 'y', 'z')
    })
    om.datasets.put(right, 'samplez', append=False)
    right_coll = om.datasets.collection('samplez')
    merged = MDataFrame(self.coll).merge(right_coll, on='x', how='left')
    actual = merged.value
    expected = self.df.merge(right, on='x', how='left')
    self.assertTrue(actual.equals(expected))
def test_unique_series(self):
    """``unique()`` on column ``x`` returns the same values, in order, as pandas.

    The reference frame is stored as 'uniques' and read back through that
    collection. The earlier ``coll = self.coll`` binding was dead (it was
    overwritten before use) and has been dropped.
    """
    df = self.df
    om = self.om
    om.datasets.put(df, 'uniques', append=False)
    coll = om.datasets.collection('uniques')
    result = MDataFrame(coll).x.unique().value
    self.assertListEqual(list(result), list(df.x.unique()))
def test_aggregate_multi_stats(self):
    """Several statistics on ``x`` per group equal the flattened pandas aggregate."""
    stats = {'x': ['sum', 'mean', 'max', 'min', 'std']}
    actual = MDataFrame(self.coll).groupby(['x']).agg(stats)
    expected = self.df.groupby('x').agg(stats)
    # pandas yields a column MultiIndex; map it through flatten_columns
    expected.columns = expected.columns.map(flatten_columns)
    # align the column order with the MDataFrame result before comparing
    expected = expected[actual.columns]
    assert_frame_equal(expected, actual, check_dtype=False)
def test_mdataframe_merge_right_cartesian(self):
    """A sorted left merge against a twice-written right dataset equals pandas."""
    om = self.om
    seed = pd.DataFrame({
        column: list(range(0, 5)) for column in ('x', 'y', 'z')
    })
    # write the seed twice, the second time with append=True, then read
    # the stored dataset back as the pandas right-hand side
    om.datasets.put(seed, 'samplez', append=False)
    om.datasets.put(seed, 'samplez', append=True)
    right = om.datasets.get('samplez')
    right_coll = om.datasets.collection('samplez')
    merged = MDataFrame(self.coll).merge(right_coll, on='x', how='left',
                                         sort=True)
    actual = merged.value
    expected = self.df.merge(right, on='x', how='left', sort=True)
    # compare using the column order of the MDataFrame result
    expected = expected[actual.columns]
    self.assertTrue(actual.equals(expected))
def test_groupby(self):
    """Each group yielded by ``groupby(['x'])`` equals the pandas rows with that ``x``."""
    reference = self.df
    seen = []
    for group_key, group in MDataFrame(self.coll).groupby(['x']):
        x_value = group_key.get('x')
        seen.append(x_value)
        expected = reference[reference.x == x_value]
        assert_frame_equal(expected, group.value)
    # every distinct x value must have been visited
    self.assertEqual(set(seen), set(reference.x))
def apply(self, fn, inplace=False, preparefn=None):
    """Attach ``fn`` as ``apply_fn`` and return the object that carries it.

    :param fn: stored on the returned object as ``apply_fn``
    :param inplace: if True, ``apply_fn`` is set on ``self`` and ``self`` is
        returned; otherwise a new object is created on the same collection
    :param preparefn: passed as ``preparefn`` to the new object's
        constructor; it is not used when ``inplace`` is True
    :return: ``self`` if ``inplace``, else the new MSeries or MDataFrame
    """
    if inplace:
        self.apply_fn = fn
        return self
    options = self._getcopy_kwargs()
    options.update(preparefn=preparefn)
    # keep the copy the same kind of object as the original
    factory = MSeries if isinstance(self, MSeries) else MDataFrame
    clone = factory(self.collection, **options)
    clone.apply_fn = fn
    return clone
def test_mdataframe_merge_filtered(self):
    """A left merge with ``filter=dict(x__in=[1, 2])`` equals merging the pre-filtered frame."""
    om = self.om
    seed = pd.DataFrame({
        column: list(range(0, 5)) for column in ('x', 'y', 'z')
    })
    # write the seed twice, the second time with append=True, then read
    # the stored dataset back as the pandas right-hand side
    om.datasets.put(seed, 'samplez', append=False)
    om.datasets.put(seed, 'samplez', append=True)
    right = om.datasets.get('samplez')
    right_coll = om.datasets.collection('samplez')
    merged = MDataFrame(self.coll).merge(right_coll, on='x', how='left',
                                         sort=True,
                                         filter=dict(x__in=[1, 2]))
    actual = merged.value
    # pandas equivalent: restrict the left frame to x in (1, 2) first
    left = self.df
    wanted = left['x'].isin([1, 2])
    expected = left[wanted].merge(right, on='x', how='left', sort=True)
    expected = expected[actual.columns]
    self.assertTrue(actual.equals(expected))
def get(self, name, version=-1, lazy=False, raw=False, parser=None, **kwargs):
    """Build an MDataFrame on the named collection and return it or its value.

    :param name: name passed to ``self.data_store.collection``
    :param version: accepted but not used in this method
    :param lazy: if True return the MDataFrame itself, else its ``.value``
    :param raw: passed through to MDataFrame as ``raw``
    :param parser: passed through to MDataFrame as ``parser``; when falsy, a
        default based on ``json_normalize`` is used
    :param kwargs: passed to MDataFrame as ``query``
    :return: the MDataFrame if ``lazy``, otherwise ``mdf.value``
    """
    # NOTE(review): kwargs is handed to MDataFrame twice, once as ``query``
    # and once expanded as keyword arguments -- confirm this is intended.

    def normalize_records(records):
        # json_normalize needs a list of dicts to work, not a generator
        return json_normalize(list(records))

    collection = self.data_store.collection(name)
    mdf = MDataFrame(collection, query=kwargs,
                     parser=parser or normalize_records, raw=raw, **kwargs)
    if lazy:
        return mdf
    return mdf.value
def test_mdataframe_merge_append(self):
    """Compare a left merge with empty suffixes against the row-wise concatenation.

    The pandas reference frame is built with ``pd.concat`` because
    ``DataFrame.append`` was removed in pandas 2.0; with
    ``ignore_index=True`` the two produce the same frame.
    """
    # FIXME this does not work
    coll = self.coll
    df = self.df
    om = self.om
    other = pd.DataFrame({
        'x': list(range(0, 5)),
        'y': list(range(0, 5)),
        'z': list(range(0, 5)),
    })
    om.datasets.put(other, 'samplez', append=False)
    mdf = om.datasets.getl('samplez')
    # NOTE(review): this is the MDataFrame's own append, not pandas'
    mdf.append(mdf)
    coll2 = om.datasets.collection('samplez')
    result = MDataFrame(coll).merge(coll2, on='x', how='left',
                                    suffixes=('', '')).value
    testdf = pd.concat([df, other], ignore_index=True)
    testdf = testdf[result.columns]
    assert_frame_equal(result, testdf)
def test_mdataframe_count(self):
    """``count()`` and ``len()`` of an MDataFrame agree with the pandas frame.

    The length check used to compare ``len(mdf)`` with itself, which could
    never fail; it now compares against the reference frame.
    """
    coll = self.coll
    df = self.df
    mdf = MDataFrame(coll)
    assert_series_equal(df.count(), mdf.count())
    self.assertEqual(len(mdf), len(df))
def test_mdataframe_column_attribute(self):
    """Attribute access to column ``y`` yields the same series as pandas."""
    actual = MDataFrame(self.coll).y.value
    expected = self.df.y
    self.assertTrue(expected.equals(actual))
def test_mdataframe_columns_slice(self):
    """Selecting columns ``x`` and ``y`` by list yields the same frame as pandas."""
    actual = MDataFrame(self.coll)[['x', 'y']].value
    expected = self.df[['x', 'y']]
    self.assertTrue(expected.equals(actual))
def test_mdataframe_sort(self):
    """Sorting by ``['-x', '-y']`` equals a pandas sort descending on ``x`` then ``y``."""
    actual = MDataFrame(self.coll).sort(['-x', '-y']).value
    expected = self.df.sort_values(['x', 'y'], ascending=[False, False])
    assert_frame_equal(expected, actual)
def test_count_column(self):
    """Counting column ``x`` per ``x`` group equals the pandas grouped count."""
    actual = MDataFrame(self.coll).groupby(['x']).x.count()
    expected = self.df.groupby('x').x.count()
    self.assertTrue(actual.equals(expected))
def test_count(self):
    """Counting all columns per ``x`` group equals the pandas grouped count."""
    actual = MDataFrame(self.coll).groupby(['x']).count()
    expected = self.df.groupby('x').count()
    self.assertTrue(expected.equals(actual))