def test_dataframe_join_cats():
    """Join two frames indexed by a categorical column and compare to pandas.

    Both sides are built with a categorical column 'a' (categories 'a', 'b',
    'c') that is then used as the index. The joined result is compared
    against the pandas join of the converted frames, followed by a few loose
    sanity checks on the joined frame itself.
    """
    lhs = DataFrame()
    lhs['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    lhs['b'] = bb = np.arange(len(lhs))
    lhs = lhs.set_index('a')

    rhs = DataFrame()
    rhs['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rhs['c'] = cc = np.arange(len(rhs))
    rhs = rhs.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining, so the index is
    # reset (dropped) on both sides and only the column data is compared.
    # pd.testing is the supported public location of assert_frame_equal;
    # pd.util.testing is deprecated.
    pd.testing.assert_frame_equal(
        got.sort_values(by='b')
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
def test_dataframe_join_suffix():
    """Join frames with overlapping column names, with and without suffixes.

    A suffix-less join is expected to raise ValueError with a specific
    message; the suffixed, sorted join is compared column by column with
    the pandas join of the same data.
    """
    # NOTE(review): this file defines another test with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    np.random.seed(0)

    df = DataFrame()
    for name in 'abc':
        df[name] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # Without suffixes the join must be rejected.
    with pytest.raises(ValueError) as excinfo:
        left.join(right)
    excinfo.match("there are overlapping columns but lsuffix"
                  " and rsuffix are not defined")

    suffixes = dict(lsuffix='_left', rsuffix='_right')
    got = left.join(right, sort=True, **suffixes)

    # Expected value from pandas
    pddf = df.to_pandas()
    pd_left = pddf.set_index('a')
    pd_right = pddf.set_index('c')
    expect = pd_left.join(pd_right, **suffixes)

    # Same columns, same index, same per-column contents
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for name in expect.columns:
        _check_series(expect[name], got[name])
def test_dataframe_join_how(aa, bb, how, method):
    """Compare an index join against pandas for a given join type and method.

    Parameters
    ----------
    aa, bb
        Values assigned to columns 'a' and 'b' of the input frame
        (presumably supplied by a pytest parametrization not visible here).
    how
        Join type forwarded to ``join`` on both the pandas and the GPU side.
    method
        Join method forwarded only to the GPU-side ``join``.
    """
    # NOTE(review): this file defines other tests with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work_pandas(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    def work_gdf(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)

    # Untouched copies, used for the 'outer' comparison below.
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType: cast both columns to float64 and
    # fill missing entries with NaN on both sides.
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    got.drop_column('b')
    got.add_column('b', gotb.astype(np.float64).fillna(np.nan))
    got.drop_column('a')
    got.add_column('a', gota.astype(np.float64).fillna(np.nan))
    # On the pandas side plain column assignment overwrites in place. The
    # former ``expect.drop(...)`` calls discarded their result and had no
    # effect, so they are omitted.
    expect['b'] = expectb.astype(np.float64).fillna(np.nan)
    expect['a'] = expecta.astype(np.float64).fillna(np.nan)

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    if how != 'outer':
        # Row order is normalized by sorting on both columns before comparing.
        pd.testing.assert_frame_equal(
            got.to_pandas().sort_values(['b', 'a']).reset_index(drop=True),
            expect.sort_values(['b', 'a']).reset_index(drop=True))
    else:
        _check_series(expecto['b'], goto['b'])
        _check_series(expecto['a'], goto['a'])
def test_dataframe_append_to_empty():
    """Assign an empty column, then a three-element column, and compare.

    The same two assignments are made on a pandas frame and on a GPU frame;
    the GPU frame converted to pandas must equal the pandas frame.
    """
    pdf = pd.DataFrame()
    gdf = DataFrame()

    # Fill both frames the same way: empty 'a' first, then populated 'b'.
    for frame in (pdf, gdf):
        frame['a'] = []
        frame['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_dataframe_multi_column_join():
    """Left-merge two frames on two key columns and compare with pandas.

    Both inputs hold 500 rows of random integer keys ('key1', 'key2') plus a
    running 'val1' column. The GPU merge result is sorted on all columns
    before comparison, so row ordering of the merge itself is not checked.
    """
    np.random.seed(0)
    nelem = 500

    # Make GDF
    df_left = DataFrame()
    df_left['key1'] = np.random.randint(0, 30, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['val1'] = np.arange(nelem)

    df_right = DataFrame()
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['val1'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result
    pddf_joined = pddf_left.merge(pddf_right, on=['key1', 'key2'],
                                  how='left', sort=True)

    # Test (doesn't check for ordering)
    join_result = df_left.merge(df_right, on=['key1', 'key2'], how='left')

    # Columns whose name contains '_y' are cast to float64 with missing
    # entries filled by NaN (presumably the right-hand value column, which
    # can be null after a left join -- confirm).
    for col in pddf_joined.columns:
        if '_y' in col:
            join_result[col] = (join_result[col]
                                .astype(np.float64)
                                .fillna(np.nan))

    # pd.testing is the supported public location of assert_frame_equal;
    # pd.util.testing is deprecated.
    pd.testing.assert_frame_equal(
        join_result
        .to_pandas()
        .sort_values(list(pddf_joined.columns))
        .reset_index(drop=True),
        pddf_joined)
def test_df_cat_sort_index():
    """Sort a frame by its categorical index and compare with pandas."""
    labels = list('aababcabbc')

    df = DataFrame()
    df['a'] = pd.Categorical(labels, categories=list('abc'))
    df['b'] = np.arange(len(df))

    gdf_indexed = df.set_index('a')
    got = gdf_indexed.sort_index()

    pdf_indexed = df.to_pandas().set_index('a')
    expect = pdf_indexed.sort_index()

    # Column names and index labels must agree ...
    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    # ... and so must the index and the 'b' payload as arrays.
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())
def test_to_pandas():
    """Converting to pandas keeps column names, dtypes and lengths."""
    # NOTE(review): this file defines another test with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    df = DataFrame()
    df['a'] = np.arange(10, dtype=np.int32)
    df['b'] = np.arange(10, 20, dtype=np.float64)

    pdf = df.to_pandas()

    assert tuple(df.columns) == tuple(pdf.columns)

    # dtypes first, then lengths, for each column in turn
    for name in ('a', 'b'):
        assert df[name].dtype == pdf[name].dtype
    for name in ('a', 'b'):
        assert len(df[name]) == len(pdf[name])
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slice a frame with masked columns and compare with slicing in pandas.

    Parameters
    ----------
    nelem
        Number of rows in each column.
    slice_start, slice_end
        Bounds of the slice applied to both the GPU and the pandas frame.
    """
    gdf = DataFrame()
    gdf['a'] = list(range(nelem))
    gdf['b'] = list(range(nelem, 2 * nelem))

    # Attach a random bitmask to each column ('a' first, then 'b').
    for name in ('a', 'b'):
        gdf[name] = gdf[name].set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)

    # Slice after conversion vs. convert after slicing
    expect = gdf.to_pandas()[window]
    got = gdf[window].to_pandas()

    pd.testing.assert_frame_equal(expect, got)
def test_to_pandas():
    """Check that ``to_pandas`` preserves column names, dtypes and lengths."""
    # NOTE(review): this file defines another test with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    columns = ('a', 'b')

    df = DataFrame()
    df['a'] = np.arange(10, dtype=np.int32)
    df['b'] = np.arange(10, 20, dtype=np.float64)

    pdf = df.to_pandas()

    assert tuple(df.columns) == tuple(pdf.columns)

    # Per-column dtype agreement
    for col in columns:
        assert df[col].dtype == pdf[col].dtype

    # Per-column length agreement
    for col in columns:
        assert len(df[col]) == len(pdf[col])
def test_groupby_apply():
    """Apply a column-adding function per group and compare with pandas.

    The frame is grouped by ('key1', 'key2'); the applied function adds an
    'out' column equal to 'val1' + 'val2'. The pandas result is sorted by
    the keys and re-indexed before comparison.
    """
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(['key1', 'key2'], as_index=False)
    got_grpby = df.groupby(['key1', 'key2'])

    def foo(df):
        # Runs once per group on both the pandas and the GPU side.
        df['out'] = df['val1'] + df['val2']
        return df

    expect = expect_grpby.apply(foo)
    expect = expect.sort_values(['key1', 'key2']).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()

    # pd.testing is the supported public location of assert_frame_equal;
    # pd.util.testing is deprecated.
    pd.testing.assert_frame_equal(expect, got)
def test_groupby_apply_grouped():
    """Run ``apply_grouped`` with a per-group kernel and compare with pandas.

    The frame is grouped by ('key1', 'key2'). The kernel reads 'key1' and
    'val1' and writes two output columns, 'com1' (float64) and 'com2'
    (int32). The expected frame is produced by emulating the same
    computation per group in pandas.
    """
    from numba import cuda

    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(['key1', 'key2'], as_index=False)
    got_grpby = df.groupby(['key1', 'key2'])

    def foo(key1, val1, com1, com2):
        # Each thread starts at its own thread index and advances by the
        # block size over the group's rows.
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    got = got_grpby.apply_grouped(foo,
                                  incols=['key1', 'val1'],
                                  outcols={
                                      'com1': np.float64,
                                      'com2': np.int32
                                  },
                                  tpb=8)
    got = got.to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(df):
        df['com1'] = df.key1 * 10000 + df.val1
        df['com2'] = np.arange(len(df), dtype=np.int32)
        return df

    expect = expect_grpby.apply(emulate)
    expect = expect.sort_values(['key1', 'key2']).reset_index(drop=True)

    # pd.testing is the supported public location of assert_frame_equal;
    # pd.util.testing is deprecated.
    pd.testing.assert_frame_equal(expect, got)
def test_dataframe_join_how(aa, bb, how):
    """Compare a sorted index join against pandas for the given join type.

    Parameters
    ----------
    aa, bb
        Values assigned to columns 'a' and 'b' of the input frame
        (presumably supplied by a pytest parametrization not visible here).
    how
        Join type forwarded to ``join``.
    """
    # NOTE(review): this file defines other tests with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def timed_join(frame):
        # Same steps for either frame type; the elapsed time is printed.
        started = timer()
        by_a = frame.set_index('a')
        by_b = frame.set_index('b')
        result = by_a.join(by_b, how=how, sort=True)
        stopped = timer()
        print('timing', type(frame), stopped - started)
        return result

    expect = timed_join(df.to_pandas())
    got = timed_join(df)

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for col in ('b', 'a'):
        _check_series(expect[col], got[col])
def test_dataframe_join_suffix():
    """Join frames with overlapping column names, with and without suffixes.

    A suffix-less join is expected to raise ValueError with a specific
    message; the suffixed join is compared column by column with the
    pandas join of the same data.
    """
    # NOTE(review): this file defines another test with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    np.random.seed(0)

    df = DataFrame()
    for col in 'abc':
        df[col] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # Without suffixes the join must be rejected.
    with pytest.raises(ValueError) as err:
        left.join(right)
    err.match("there are overlapping columns but lsuffix"
              " and rsuffix are not defined")

    suffix_kwargs = {'lsuffix': '_left', 'rsuffix': '_right'}
    got = left.join(right, **suffix_kwargs)

    # Expected value from pandas
    pddf = df.to_pandas()
    pd_left = pddf.set_index('a')
    pd_right = pddf.set_index('c')
    expect = pd_left.join(pd_right, **suffix_kwargs)

    # Same columns, same index, same per-column contents
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for col in expect.columns:
        _check_series(expect[col], got[col])
def test_dataframe_join_how(aa, bb, how):
    """Check a sorted index join of 'a'-indexed and 'b'-indexed frames.

    Parameters
    ----------
    aa, bb
        Values assigned to columns 'a' and 'b' of the input frame
        (presumably supplied by a pytest parametrization not visible here).
    how
        Join type forwarded to ``join``.
    """
    # NOTE(review): this file defines other tests with this same name;
    # within one module only the last definition is kept -- confirm which
    # one is intended.
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def run_join(frame):
        # Works on either frame type; prints how long the join took.
        t0 = timer()
        left = frame.set_index('a')
        right = frame.set_index('b')
        out = left.join(right, how=how, sort=True)
        t1 = timer()
        print('timing', type(frame), t1 - t0)
        return out

    expect = run_join(df.to_pandas())
    got = run_join(df)

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for name in ('b', 'a'):
        _check_series(expect[name], got[name])