# Shared imports for the tests below. Helper names not imported here
# (raise_builder, _check_series, read_data, ...) are local test utilities from
# the surrounding suite; the cudf.tests.* and cudf.comm.* paths are assumptions
# based on the cudf test-suite layout and may need adjusting to your checkout.
import gc
import pickle
import random

import numpy as np
import pandas as pd
import pytest
import rmm
from numba import cuda

import cudf
import cudf as gd  # some tests below use the `gd` alias
from cudf import DataFrame, Series, get_dummies
from cudf.comm.gpuarrow import GpuArrowReader
from cudf.tests import utils
from cudf.tests.utils import assert_eq


def test_dataframe_replace_with_nulls():
    # numerical
    pdf1 = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    gdf1 = DataFrame.from_pandas(pdf1)
    pdf2 = pdf1.replace(0, 4)
    gdf2 = gdf1.replace(0, None).fillna(4)
    pd.testing.assert_frame_equal(gdf2.to_pandas(), pdf2)

    # list input
    pdf6 = pdf1.replace([0, 1], [4, 5])
    gdf6 = gdf1.replace([0, 1], [4, None]).fillna(5)
    pd.testing.assert_frame_equal(gdf6.to_pandas(), pdf6)

    pdf7 = pdf1.replace([0, 1], 4)
    gdf7 = gdf1.replace([0, 1], None).fillna(4)
    pd.testing.assert_frame_equal(gdf7.to_pandas(), pdf7)

    # dict input:
    pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4)
    pd.testing.assert_frame_equal(gdf8.to_pandas(), pdf8)

    gdf1 = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3)
    pd.testing.assert_frame_equal(gdf9.to_pandas(), pdf6)
def read_ipc_to_DF(self, source):
    '''
    description: read Arrow IPC buffers produced by another process whose
                 dataframe is already on the GPU
    input: source: file path prefix (without the .pickle suffix)
    return: status string
    '''
    try:
        with open(source + '.pickle', 'rb') as handle:
            # the file stores the repr of a list of pickled IPC handles
            buffer = eval(pickle.load(handle))
        with open(source + '-col.pickle', 'rb') as handle:
            columns = list(pickle.load(handle))
        self.data_gpu = DataFrame()
        for i, j in enumerate(buffer):
            temp_ipc_handler = pickle.loads(j)
            with temp_ipc_handler as temp_nd_array:
                # copy out of the foreign IPC buffer while the handle is open
                np_arr = np.zeros(temp_nd_array.size,
                                  dtype=temp_nd_array.dtype)
                np_arr_gpu = cuda.to_device(np_arr)
                np_arr_gpu.copy_to_device(temp_nd_array)
            self.data_gpu[columns[i]] = cudf.Series(np_arr_gpu)
        self.back_up_dimension = self.data_gpu
    except Exception as e:
        del self.data_gpu
        del self.back_up_dimension
        gc.collect()
        return "Exception *** in cudf read_ipc_to_DF():" + str(e)
    return "data read successfully"
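
# A minimal sketch of the producer counterpart read_ipc_to_DF expects
# (hypothetical; the function name and the `.data.mem.get_ipc_handle()`
# access path are assumptions about the cudf/numba version in use, not this
# project's actual writer): pickle the repr of a list of pickled CUDA IPC
# handles to '<source>.pickle' and the column names to '<source>-col.pickle'.
def write_DF_to_ipc_sketch(data_gpu, source):
    handles = [pickle.dumps(data_gpu[col].data.mem.get_ipc_handle())
               for col in data_gpu.columns]
    with open(source + '.pickle', 'wb') as handle:
        # read_ipc_to_DF does eval(pickle.load(...)), so store the repr
        pickle.dump(repr(handles), handle)
    with open(source + '-col.pickle', 'wb') as handle:
        pickle.dump(list(data_gpu.columns), handle)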
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for i in range(num_cols):
        pdf[i] = pd.Series(str_data, dtype='str')
        gdf[i] = Series(str_data, dtype='str')
    pdf['a'] = other_data
    gdf['a'] = other_data

    pdf2 = pdf.copy()
    gdf2 = gdf.copy()

    expectation = raise_builder([how_raise], NotImplementedError)

    with expectation:
        expect = pdf.merge(pdf2, on=['a'], how=how)
        got = gdf.merge(gdf2, on=['a'], how=how)

        if len(expect) == 0 and len(got) == 0:
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
def test_string_join_non_key_nulls(str_data_nulls):
    str_data = ['a', 'b', 'c', 'd', 'e']
    other_data = [1, 2, 3, 4, 5]
    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    pdf['vals'] = pd.Series(str_data, dtype='str')
    gdf['vals'] = Series(str_data, dtype='str')
    pdf['key'] = other_data
    gdf['key'] = other_data

    pdf2 = pd.DataFrame()
    gdf2 = DataFrame()
    pdf2['vals'] = pd.Series(str_data_nulls, dtype='str')
    gdf2['vals'] = Series(str_data_nulls, dtype='str')
    pdf2['key'] = pd.Series(other_data_nulls, dtype='int64')
    gdf2['key'] = Series(other_data_nulls, dtype='int64')

    expect = pdf.merge(pdf2, on='key', how='left')
    got = gdf.merge(gdf2, on='key', how='left')

    if len(expect) == 0 and len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in "abc":
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index("a")
    right = df.set_index("c")
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True)
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index("a").join(pddf.set_index("c"),
                                      lsuffix="_left", rsuffix="_right")
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k].fillna(-1), got[k].fillna(-1))
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take rows of the key columns and build a set of the key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other groups
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
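
# Illustrative sketch of the invariant checked above (not cudf's actual
# implementation): hash partitioning routes every row with the same key tuple
# to the same partition, part_id = hash(key) % nparts, which is why two
# partitions can never share a key-value.
def _toy_partition_by_hash(rows, key_idx, nparts):
    parts = [[] for _ in range(nparts)]
    for row in rows:
        key = tuple(row[i] for i in key_idx)
        parts[hash(key) % nparts].append(row)
    return parts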
def test_dataframe_pairs_of_triples(pairs, max, rows, how):
    np.random.seed(0)

    pdf_left = pd.DataFrame()
    pdf_right = pd.DataFrame()
    for left_column in pairs[0]:
        pdf_left[left_column] = np.random.randint(0, max, rows)
    for right_column in pairs[1]:
        pdf_right[right_column] = np.random.randint(0, max, rows)
    gdf_left = DataFrame.from_pandas(pdf_left)
    gdf_right = DataFrame.from_pandas(pdf_right)

    # Both conditions detect the same situation: no shared column names.
    if (not set(pdf_left.columns).intersection(pdf_right.columns)
            or not any(value in pdf_right for value in pdf_left)):
        with pytest.raises(pd.core.reshape.merge.MergeError) as raises:
            pdf_left.merge(pdf_right)
        raises.match("No common columns to perform merge on")
        with pytest.raises(ValueError) as raises:
            gdf_left.merge(gdf_right)
        raises.match("No common columns to perform merge on")
    else:
        pdf_result = pdf_left.merge(pdf_right, how=how)
        gdf_result = gdf_left.merge(gdf_right, how=how)
        assert np.array_equal(gdf_result.columns, pdf_result.columns)
        for column in gdf_result:
            assert np.array_equal(
                gdf_result[column].fillna(-1).sort_values(),
                pdf_result[column].fillna(-1).sort_values())
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']
    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000
    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
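
# Reference sketch of what utils.expand_bits_to_bytes is assumed to do
# (hypothetical reimplementation, not the test-suite helper): unpack a
# bytes-like validity bitmask, LSB first within each byte, so that every
# bit becomes one element, 1 = valid row, 0 = null row.
def _expand_bits_to_bytes_sketch(bitmask):
    return [(byte >> bit) & 1 for byte in bitmask for bit in range(8)]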
def test_label_encode_drop_one():
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()

    # initialize data frame
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)
    vals = list(df['cats'].unique())

    # drop 1 randomly
    del vals[random.randrange(len(vals))]

    lab = dict(zip(vals, list(range(len(vals)))))

    # label encode series
    ncol = df['cats'].label_encoding(cats=vals, dtype='float32')
    arr = ncol.to_array()

    # verify labels of new column
    for i in range(arr.size):
        # assuming -1 is used for missing value
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # label encode data frame
    df2 = df.label_encoding(column='cats', prefix='cats', cats=vals,
                            dtype='float32')

    assert df2.columns[0] == 'cats'
    assert df2.columns[1] == 'cats_labels'
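
# Plain-Python sketch of the label-encoding semantics exercised above (an
# assumption inferred from the test, not cudf's implementation): each value
# is mapped to its position in `cats`, with -1 for values not in `cats`.
def _label_encode_sketch(values, cats):
    lookup = {c: i for i, c in enumerate(cats)}
    return [lookup.get(v, -1) for v in values]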
def test_dataframe_empty_to_string():
    # Test for printing empty dataframe
    df = DataFrame()
    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: []\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_dataframe_empty_merge():
    gdf1 = DataFrame([("a", []), ("b", [])])
    gdf2 = DataFrame([("a", []), ("c", [])])

    expect = DataFrame([("a", []), ("b", []), ("c", [])])
    got = gdf1.merge(gdf2, how="left", on=["a"])

    assert_eq(expect, got)
def test_merge_left_right_index_left_right_on_kwargs2(kwargs):
    left = pd.DataFrame({'x': [1, 2, 3]}, index=[10, 20, 30])
    right = pd.DataFrame({'y': [10, 20, 30]}, index=[1, 2, 30])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    gd_merge = gleft.merge(gright, **kwargs)
    pd_merge = left.merge(right, **kwargs)
    if pd_merge.empty:
        assert gd_merge.empty
def test_merge_left_right_index_left_right_on_kwargs(kwargs):
    left = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6])
    right = pd.DataFrame({'y': [10, 20, 30, 6, 5, 4]},
                         index=[1, 2, 3, 4, 5, 7])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    pd_merge = left.merge(right, **kwargs)
    gd_merge = gleft.merge(gright, **kwargs)
    assert_eq(pd_merge, gd_merge)
def test_onehot_get_dummies_simple():
    df = DataFrame({'x': np.arange(10)})
    original = df.copy()
    encoded = get_dummies(df, prefix='test')

    assert_eq(df, original)  # the original df should be unchanged
    cols = list(encoded.columns)[1:]
    actual = DataFrame(dict(zip(cols, np.eye(len(cols)))))
    assert (encoded.loc[:, cols] == actual).all().all()
def test_to_records_noindex():
    df = DataFrame()
    df["a"] = aa = np.arange(10, dtype=np.int32)
    df["b"] = bb = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ("a", "b")
    np.testing.assert_array_equal(rec["a"], aa)
    np.testing.assert_array_equal(rec["b"], bb)
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_dataframe_emptycolumns_to_string():
    # Test for printing dataframe having empty columns
    df = DataFrame()
    df['a'] = []
    df['b'] = []
    got = df.to_string()
    print(got)
    expect = "Empty DataFrame\nColumns: ['a', 'b']\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_groupby_apply_basic_agg_single_column():
    gdf = DataFrame()
    gdf['key'] = [0, 0, 1, 1, 2, 2, 0]
    gdf['val'] = [0, 1, 2, 3, 4, 5, 6]
    gdf['mult'] = gdf['key'] * gdf['val']

    pdf = gdf.to_pandas()

    gdg = gdf.groupby(['key', 'val']).mult.sum()
    pdg = pdf.groupby(['key', 'val']).mult.sum()
    assert_eq(pdg, gdg)
def test_merge_left_index_zero():
    left = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5])
    right = pd.DataFrame({'y': [10, 20, 30, 6, 5, 4]},
                         index=[0, 1, 2, 3, 4, 6])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    pd_merge = left.merge(right, left_on='x', right_on='y')
    gd_merge = gleft.merge(gright, left_on='x', right_on='y')
    assert_eq(pd_merge, gd_merge)
def test_dataframe_join_cats():
    lhs = DataFrame()
    lhs['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    lhs['b'] = bb = np.arange(len(lhs))
    lhs = lhs.set_index('a')

    rhs = DataFrame()
    rhs['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rhs['c'] = cc = np.arange(len(rhs))
    rhs = rhs.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining
    pd.util.testing.assert_frame_equal(
        got.sort_values(by='b')
           .to_pandas()
           .sort_index()
           .reset_index(drop=True),
        expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
def test_dataframe_sort_values(nelem, dtype):
    np.random.seed(0)
    df = DataFrame()
    df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype)
    df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype)
    sorted_df = df.sort_values(by="a")
    # Check
    sorted_index = np.argsort(aa, kind="mergesort")
    np.testing.assert_array_equal(sorted_df.index.values, sorted_index)
    np.testing.assert_array_equal(sorted_df["a"], aa[sorted_index])
    np.testing.assert_array_equal(sorted_df["b"], bb[sorted_index])
def test_to_records_withindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec_indexed = df.to_records(index=True)
    assert rec_indexed.size == len(aa)
    assert rec_indexed.dtype.names == ('index', 'a', 'b')
    np.testing.assert_array_equal(rec_indexed['a'], aa)
    np.testing.assert_array_equal(rec_indexed['b'], bb)
    np.testing.assert_array_equal(rec_indexed['index'], np.arange(10))
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near-constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Check dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()
    assert mat.max() == 1
    assert mat.min() == 0
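
# For context on the final assertions above: Series.scale() is assumed here
# to min-max normalize, i.e. (x - x.min()) / (x.max() - x.min()), which
# together with the 0/1 one-hot columns bounds the matrix to [0, 1].
# Hypothetical plain-Python equivalent:
def _minmax_scale_sketch(values):
    lo, hi = min(values), max(values)
    return [(v - lo) / (hi - lo) for v in values]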
def test_df_cat_sort_index():
    df = DataFrame()
    df['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    df['b'] = np.arange(len(df))

    got = df.set_index('a').sort_index()
    expect = df.to_pandas().set_index('a').sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())