def test_dataframe_join_cats():
    """Join two frames indexed by the same categorical and compare the
    result against the equivalent pandas join."""
    left = DataFrame()
    left['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    left['b'] = bb = np.arange(len(left))
    left = left.set_index('a')

    right = DataFrame()
    right['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    right['c'] = cc = np.arange(len(right))
    right = right.set_index('a')

    got = left.join(right)
    expect = left.to_pandas().join(right.to_pandas())

    # Note: pandas make a object Index after joining
    sorted_got = (got.sort_values(by='b')
                     .to_pandas()
                     .sort_index()
                     .reset_index(drop=True))
    pd.util.testing.assert_frame_equal(sorted_got,
                                       expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on an integer key where the right frame's non-key
    string column may contain nulls; cudf must match pandas.

    NOTE(review): a function with this exact name appears again later in
    this file, so this copy is shadowed at import time.
    """
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]
    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    left_pd, left_gd = pd.DataFrame(), DataFrame()
    left_pd["vals"] = pd.Series(str_data, dtype="str")
    left_gd["vals"] = Series(str_data, dtype="str")
    left_pd["key"] = other_data
    left_gd["key"] = other_data

    right_pd, right_gd = pd.DataFrame(), DataFrame()
    right_pd["vals"] = pd.Series(str_data_nulls, dtype="str")
    right_gd["vals"] = Series(str_data_nulls, dtype="str")
    right_pd["key"] = pd.Series(other_data_nulls, dtype="int64")
    right_gd["key"] = Series(other_data_nulls, dtype="int64")

    expect = left_pd.merge(right_pd, on="key", how="left")
    got = left_gd.merge(right_gd, on="key", how="left")

    # an empty result carries no meaningful index / column order
    if len(expect) == 0 and len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_string_join_non_key_nulls(str_data_nulls):
    """Compare cudf with pandas for a left-merge whose right-hand string
    column may contain nulls."""
    str_data = ['a', 'b', 'c', 'd', 'e']
    key_vals = [1, 2, 3, 4, 5]
    null_key_vals = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    pdf['vals'] = pd.Series(str_data, dtype='str')
    gdf['vals'] = Series(str_data, dtype='str')
    pdf['key'] = key_vals
    gdf['key'] = key_vals

    rhs_pdf = pd.DataFrame()
    rhs_gdf = DataFrame()
    rhs_pdf['vals'] = pd.Series(str_data_nulls, dtype='str')
    rhs_gdf['vals'] = Series(str_data_nulls, dtype='str')
    rhs_pdf['key'] = pd.Series(null_key_vals, dtype='int64')
    rhs_gdf['key'] = Series(null_key_vals, dtype='int64')

    expect = pdf.merge(rhs_pdf, on='key', how='left')
    got = gdf.merge(rhs_gdf, on='key', how='left')

    # normalize the degenerate empty case before comparing
    if len(expect) == 0 and len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_dataframe_empty_merge():
    """Merging two empty frames yields an empty frame carrying the union
    of both column sets.

    NOTE(review): a function with this exact name appears again later in
    this file, so this copy is shadowed at import time.
    """
    left = DataFrame([("a", []), ("b", [])])
    right = DataFrame([("a", []), ("c", [])])
    expect = DataFrame([("a", []), ("b", []), ("c", [])])

    got = left.merge(right, how="left", on=["a"])
    assert_eq(expect, got)
def test_dataframe_empty_merge():
    """Left-merge of empty frames: result is empty but has all columns."""
    lhs = DataFrame([('a', []), ('b', [])])
    rhs = DataFrame([('a', []), ('c', [])])
    got = lhs.merge(rhs, how='left', on=['a'])

    expect = DataFrame([('a', []), ('b', []), ('c', [])])
    assert_eq(expect, got)
def test_dataframe_merge_on(on): np.random.seed(0) # Make cuDF df_left = DataFrame() nelem = 500 df_left['key1'] = np.random.randint(0, 40, nelem) df_left['key2'] = np.random.randint(0, 50, nelem) df_left['left_val'] = np.arange(nelem) df_right = DataFrame() nelem = 500 df_right['key1'] = np.random.randint(0, 30, nelem) df_right['key2'] = np.random.randint(0, 50, nelem) df_right['right_val'] = np.arange(nelem) # Make pandas DF pddf_left = df_left.to_pandas() pddf_right = df_right.to_pandas() # Expected result (from pandas) pddf_joined = pddf_left.merge(pddf_right, on=on, how='left') # Test (from cuDF; doesn't check for ordering) join_result = df_left.merge(df_right, on=on, how='left') join_result_cudf = cudf.merge(df_left, df_right, on=on, how='left') join_result['right_val'] = (join_result['right_val'].astype( np.float64).fillna(np.nan)) join_result_cudf['right_val'] = (join_result_cudf['right_val'].astype( np.float64).fillna(np.nan)) for col in list(pddf_joined.columns): if (col.count('_y') > 0): join_result[col] = (join_result[col].astype(np.float64).fillna( np.nan)) join_result_cudf[col] = (join_result_cudf[col].astype( np.float64).fillna(np.nan)) # Test dataframe equality (ignore order of rows and columns) cdf_result = join_result.to_pandas() \ .sort_values(list(pddf_joined.columns)) \ .reset_index(drop=True) pdf_result = pddf_joined.sort_values(list(pddf_joined.columns)) \ .reset_index(drop=True) pd.util.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True) merge_func_result_cdf = join_result_cudf.to_pandas() \ .sort_values( list(pddf_joined.columns)) \ .reset_index(drop=True) pd.util.testing.assert_frame_equal(merge_func_result_cdf, cdf_result, check_like=True)
def test_onehot_get_dummies_simple():
    """get_dummies must one-hot encode the frame without mutating it."""
    df = DataFrame({'x': np.arange(10)})
    original = df.copy()
    encoded = get_dummies(df, prefix='test')

    # NOTE(review): under pandas semantics `df == original` returns an
    # elementwise frame, making a bare assert ambiguous — presumably the
    # cudf DataFrame equality used here returns a scalar; verify.
    assert df == original  # the original df should be unchanged

    # each dummy column should be one column of the identity matrix
    cols = list(encoded.columns)[1:]
    actual = DataFrame(dict(zip(cols, np.eye(len(cols)))))
    assert (encoded.loc[:, cols] == actual).all().all()
def test_dataframe_to_string():
    """Exercise DataFrame string rendering: row truncation, column
    truncation, and display of null (masked) values."""
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        string = str(df)
        print(string)
        assert string.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        string = df.to_string(ncols=3)
        print(string)
        assert string.splitlines()[-2] == '[1 more rows]'
        assert string.splitlines()[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])

        data = np.arange(6)
        # bitmask 0b00101101 -> valid positions 0, 2, 3, 5 (two nulls)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid position is correct
        for i in validids:
            assert data[i] == values[i]
        # null position is correct
        for i in range(len(values)):
            if i not in validids:
                assert values[i] is None

        got = df.to_string(nrows=None)
        print(got)
        # nulls in column c render as blanks
        expect = '''
  a  b c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
def test_label_encode_drop_one():
    """Label-encode with one category removed; a value whose category was
    dropped must encode as the -1 sentinel."""
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()
    df["cats"] = np.random.randint(7, size=10, dtype=np.int32)

    vals = list(df["cats"].unique())
    # drop one category at random
    del vals[random.randrange(len(vals))]
    lab = dict(zip(vals, range(len(vals))))

    # encode the series
    ncol = df["cats"].label_encoding(cats=vals, dtype="float32")
    arr = ncol.to_array()

    # verify labels of new column; -1 marks a missing category
    for i, encoded in enumerate(arr):
        np.testing.assert_equal(encoded, lab.get(df.cats[i], -1))

    # encode through the DataFrame API
    df2 = df.label_encoding(column="cats", prefix="cats",
                            cats=vals, dtype="float32")
    assert df2.columns[0] == "cats"
    assert df2.columns[1] == "cats_labels"
def test_dataframe_join_suffix():
    """Joining frames with overlapping column names must raise unless
    lsuffix/rsuffix are supplied; with suffixes the result matches pandas."""
    np.random.seed(0)

    df = DataFrame()
    for col in 'abc':
        df[col] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # without suffixes the overlapping 'b' column must raise
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right', sort=True)

    # expected result from pandas
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left', rsuffix='_right')

    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for col in expect.columns:
        _check_series(expect[col].fillna(-1), got[col].fillna(-1))
def read_ipc_to_DF(self, source):
    '''
    description: Read arrow file from another dataframe already in the gpu
    input:
        source: file path
    return:
        pandas dataframe

    Reads two pickle files ("<source>.pickle" holding a sequence of
    pickled IPC handles, "<source>-col.pickle" holding column names),
    copies each handle's device memory into a new device array, and
    rebuilds self.data_gpu column by column. On failure the partially
    built state is dropped and an error string is returned.
    '''
    try:
        # SECURITY NOTE(review): eval() on unpickled content executes
        # arbitrary code if these files can come from an untrusted
        # source — verify the provenance of `source`.
        with open(source + '.pickle', 'rb') as handle:
            buffer = eval(pickle.load(handle))
        with open(source + '-col.pickle', 'rb') as handle:
            columns = list(pickle.load(handle))
        self.data_gpu = DataFrame()
        for i, j in enumerate(buffer):
            # reopen each IPC handle and copy the shared device memory
            # into an array owned by this process
            temp_ipc_handler = pickle.loads(j)
            with temp_ipc_handler as temp_nd_array:
                np_arr = np.zeros((temp_nd_array.size),
                                  dtype=temp_nd_array.dtype)
                np_arr_gpu = cuda.to_device(np_arr)
                np_arr_gpu.copy_to_device(temp_nd_array)
                self.data_gpu[columns[i]] = cudf.Series(np_arr_gpu)
        # keep an untouched copy for later reset/restore
        self.back_up_dimension = self.data_gpu
    except Exception as e:
        # drop partial state so a failed read leaves nothing half-built
        del (self.data_gpu)
        del (self.back_up_dimension)
        gc.collect()
        return "Exception *** in cudf read_ipc_to_DF():" + str(e)
    return "data read successfully"
def test_dataframe_replace_with_nulls():
    """replace() targeting None followed by fillna must reproduce the
    result of a plain pandas replace, for scalar/list/dict inputs."""
    # numerical
    base_pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    base_gdf = DataFrame.from_pandas(base_pdf)
    scalar_pdf = base_pdf.replace(0, 4)
    scalar_gdf = base_gdf.replace(0, None).fillna(4)
    pd.testing.assert_frame_equal(scalar_gdf.to_pandas(), scalar_pdf)

    # list input
    list_pdf = base_pdf.replace([0, 1], [4, 5])
    list_gdf = base_gdf.replace([0, 1], [4, None]).fillna(5)
    pd.testing.assert_frame_equal(list_gdf.to_pandas(), list_pdf)

    bcast_pdf = base_pdf.replace([0, 1], 4)
    bcast_gdf = base_gdf.replace([0, 1], None).fillna(4)
    pd.testing.assert_frame_equal(bcast_gdf.to_pandas(), bcast_pdf)

    # dict input:
    dict_pdf = base_pdf.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    dict_gdf = base_gdf.replace({"a": 0, "b": 0},
                                {"a": None, "b": 5}).fillna(4)
    pd.testing.assert_frame_equal(dict_gdf.to_pandas(), dict_pdf)

    # replacing in a frame that already contains a null
    base_gdf = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    null_gdf = base_gdf.replace([0, 1], [4, 5]).fillna(3)
    pd.testing.assert_frame_equal(null_gdf.to_pandas(), list_pdf)
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    """partition_by_hash must return `nparts` DataFrames whose key tuples
    are disjoint across partitions."""
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for k in range(nkeys):
        name = 'key{}'.format(k)
        gdf[name] = np.random.randint(0, 7 - k, nrows)
        keycols.append(name)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)

    # No key-tuple may appear in more than one partition
    seen = set()
    for part in got:
        if not len(part):
            continue
        # Take rows of the keycolums and build a set of the key-values
        keys_here = set(map(tuple, part.as_matrix(columns=keycols)))
        # Ensure that none of the key-values have occurred in other groups
        assert not (keys_here & seen)
        seen |= keys_here
    assert len(seen)
def test_dataframe_as_gpu_matrix_null_values():
    """as_gpu_matrix must refuse columns containing nulls and succeed
    once the nulls are filled."""
    df = DataFrame()

    nelem = 123
    na = -10000
    refvalues = {}
    for colname in 'abcd':
        df[colname] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[colname] = df[colname].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        # record the expected values with nulls replaced by the sentinel
        data[~boolmask] = na
        refvalues[colname] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    # after filling nulls the conversion must succeed
    for colname in df.columns:
        df[colname] = df[colname].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for col_idx, colname in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[colname], mat[:, col_idx])
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    """Self-merge frames carrying string non-key columns; join types
    flagged by `how_raise` are expected to raise NotImplementedError."""
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf, gdf = pd.DataFrame(), DataFrame()
    for col in range(num_cols):
        pdf[col] = pd.Series(str_data, dtype="str")
        gdf[col] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    pdf2, gdf2 = pdf.copy(), gdf.copy()

    expectation = raise_builder([how_raise], NotImplementedError)
    with expectation:
        expect = pdf.merge(pdf2, on=["a"], how=how)
        got = gdf.merge(gdf2, on=["a"], how=how)

        # normalize the degenerate empty case before comparing
        if len(expect) == 0 and len(got) == 0:
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
def test_dataframe_setitem_from_masked_object():
    """Constructing from NaN-bearing arrays or None-bearing lists must
    produce a null mask with the correct null count."""
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    # from a host numpy array with NaNs
    from_np = Series(ary)
    assert from_np.has_null_mask
    assert from_np.null_count == 20

    # via a pandas DataFrame
    from_pd = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert from_pd['a'].has_null_mask
    assert from_pd['a'].null_count == 20

    # from a device array
    gpu_ary = rmm.to_device(ary)
    from_dev = Series(gpu_ary)
    assert from_dev.has_null_mask
    assert from_dev.null_count == 20

    # from a plain list containing None
    from_list = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    from_list['lst'] = lst
    assert from_list['lst'].has_null_mask
    assert from_list['lst'].null_count == 2
def test_assign():
    """assign() returns a new frame with the extra column and leaves the
    original frame untouched."""
    gdf = DataFrame({'x': [1, 2, 3]})
    assigned = gdf.assign(y=gdf.x + 1)

    assert list(gdf.columns) == ['x']
    assert list(assigned.columns) == ['x', 'y']
    np.testing.assert_equal(assigned.y.to_array(), [2, 3, 4])
def test_dataframe_empty_to_string():
    """A brand-new frame renders the canonical empty-DataFrame banner."""
    got = DataFrame().to_string()
    print(got)

    expect = "Empty DataFrame\nColumns: []\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_dataframe_astype(nelem):
    """astype() converts int32 to float32 while preserving every value."""
    df = DataFrame()
    src = np.asarray(range(nelem), dtype=np.int32)
    df['a'] = src
    assert df['a'].dtype is np.dtype(np.int32)

    df['b'] = df['a'].astype(np.float32)
    assert df['b'].dtype is np.dtype(np.float32)
    np.testing.assert_equal(df['a'].to_array(), df['b'].to_array())
def test_string_slice():
    """str.slice must behave like the pandas string accessor."""
    words = ["hello", "world"]
    gdf = DataFrame({"a": words})
    pdf = pd.DataFrame({"a": words})

    got = gdf.a.str.slice(0, 2)
    expected = pdf.a.str.slice(0, 2)

    assert isinstance(got, Series)
    assert_eq(expected, got)
def test_query_local_dict():
    """query() resolves @-prefixed names from the local_dict argument,
    for both numeric and datetime comparisons.

    NOTE(review): a function with this exact name appears again later in
    this file, so this copy is shadowed at import time.
    """
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    got = df.query("a < @val", local_dict={'val': 10})
    np.testing.assert_array_equal(aa[aa < 10], got['a'].to_array())

    # test for datetime
    df = DataFrame()
    data = np.array(['2018-10-07', '2018-10-08'], dtype='datetime64')
    df['datetimes'] = data
    search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d')
    got = df.query('datetimes==@search_date',
                   local_dict={'search_date': search_date})
    np.testing.assert_array_equal(data[1], got['datetimes'].to_array())
def test_pickle_dataframe_numeric():
    """A purely numeric frame must survive serialization round-trip."""
    np.random.seed(0)
    frame = DataFrame()
    nelem = 10
    frame["keys"] = np.arange(nelem, dtype=np.float64)
    frame["vals"] = np.random.random(nelem)

    check_serialization(frame)
def test_query_local_dict():
    """query() must pick up variables supplied via local_dict for both
    integer and datetime predicates."""
    frame = DataFrame()
    frame["a"] = aa = np.arange(100)
    expr = "a < @val"
    result = frame.query(expr, local_dict={"val": 10})
    np.testing.assert_array_equal(aa[aa < 10], result["a"].to_array())

    # test for datetime
    frame = DataFrame()
    stamps = np.array(["2018-10-07", "2018-10-08"], dtype="datetime64")
    frame["datetimes"] = stamps
    search_date = datetime.datetime.strptime("2018-10-08", "%Y-%m-%d")
    expr = "datetimes==@search_date"
    result = frame.query(expr, local_dict={"search_date": search_date})
    np.testing.assert_array_equal(stamps[1], result["datetimes"].to_array())
def test_pickle_dataframe_categorical():
    """A frame holding a categorical column must survive serialization."""
    np.random.seed(0)

    frame = DataFrame()
    frame["keys"] = pd.Categorical("aaabababac")
    frame["vals"] = np.random.random(len(frame))

    check_serialization(frame)
def test_to_records_noindex():
    """to_records(index=False) omits the index field and keeps values."""
    frame = DataFrame()
    frame["a"] = aa = np.arange(10, dtype=np.int32)
    frame["b"] = bb = np.arange(10, 20, dtype=np.float64)

    rec = frame.to_records(index=False)
    assert rec.dtype.names == ("a", "b")
    np.testing.assert_array_equal(rec["a"], aa)
    np.testing.assert_array_equal(rec["b"], bb)
def test_groupby_apply_basic_agg_single_column():
    """Multi-key groupby sum on one column must match pandas.

    NOTE(review): a function with this exact name appears again later in
    this file, so this copy is shadowed at import time.
    """
    gdf = DataFrame()
    gdf["key"] = [0, 0, 1, 1, 2, 2, 0]
    gdf["val"] = [0, 1, 2, 3, 4, 5, 6]
    gdf["mult"] = gdf["key"] * gdf["val"]
    pdf = gdf.to_pandas()

    got = gdf.groupby(["key", "val"]).mult.sum()
    expect = pdf.groupby(["key", "val"]).mult.sum()
    assert_eq(expect, got)
def test_dataframe_empty_concat():
    """Concatenating two empty frames keeps the columns, zero rows."""
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    combined = gd.concat([gdf1, gdf1.copy()])
    assert len(combined) == 0
    assert len(combined.columns) == 2
def test_dataframe_emptycolumns_to_string():
    """A frame with columns but no rows prints its column names."""
    df = DataFrame()
    for name in ('a', 'b'):
        df[name] = []

    got = df.to_string()
    print(got)

    expect = "Empty DataFrame\nColumns: ['a', 'b']\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_groupby_apply_basic_agg_single_column():
    """Grouped sum over two key columns must agree with pandas."""
    keys = [0, 0, 1, 1, 2, 2, 0]
    vals = [0, 1, 2, 3, 4, 5, 6]

    gdf = DataFrame()
    gdf['key'] = keys
    gdf['val'] = vals
    gdf['mult'] = gdf['key'] * gdf['val']
    pdf = gdf.to_pandas()

    gdg = gdf.groupby(['key', 'val']).mult.sum()
    pdg = pdf.groupby(['key', 'val']).mult.sum()
    assert_eq(pdg, gdg)
def test_dataframe_append_to_empty():
    """Assigning a non-empty column to a frame that already holds an
    empty column must behave like pandas."""
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)