def test_reading_arrow_sparse_data():
    """Preprocess an Arrow-backed DataFrame and check the resulting matrix.

    The frame is built from ``GpuArrowReader(...).to_dict()``. Feature
    columns are split into categorical and numeric groups by their number
    of unique values, numeric columns are null-filled and rescaled via
    ``.scale()``, and categorical columns are one-hot encoded. The test
    then asserts that every remaining column is ``float64`` and that the
    matrix form has a maximum of 1 and a minimum of 0.
    """
    schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    df = DataFrame(reader.to_dict().items())

    # NOTE(review): the trailing space in 'INCEARN ' is reproduced exactly
    # as found -- confirm it matches the column name in the data.
    response_set = {'INCEARN '}
    feature_names = set(df.columns) - response_set

    numeric_cols = set()
    categorical_cols = set()
    uniques = {}

    # Classify each feature column by how many distinct values it holds.
    for name in feature_names:
        try:
            distinct = df[name].unique()
        except ValueError:
            # unique() refused this column; handle it as numeric.
            numeric_cols.add(name)
            continue

        uniques[name] = distinct
        n_distinct = len(distinct)
        if n_distinct < 2:
            # A column with fewer than two distinct values is removed.
            del df[name]
        elif n_distinct < 1000:
            categorical_cols.add(name)
        else:
            numeric_cols.add(name)

    # Numeric columns: fill nulls with the column mean, then rescale.
    for name in (numeric_cols - response_set):
        mean_value = df[name].mean()
        df[name] = df[name].fillna(mean_value)
        assert df[name].null_count == 0

        std = df[name].std()
        # Drop near-constant columns (non-finite or tiny std).
        if not np.isfinite(std) or std < 1e-4:
            del df[name]
            print('drop near constant', name)
            continue
        df[name] = df[name].scale()

    # Categorical columns: one-hot encode, skipping the first category,
    # then remove the original column.
    for name in categorical_cols:
        remaining_cats = uniques[name][1:]
        df = df.one_hot_encoding(name, prefix=name, cats=remaining_cats)
        del df[name]

    # Every surviving column must be float64.
    observed_dtypes = {df[name].dtype for name in df.columns}
    assert observed_dtypes == {np.dtype('float64')}

    mat = df.as_matrix()
    assert mat.max() == 1
    assert mat.min() == 0
def test_dataframe_basic():
    """Exercise column assignment, concatenation and matrix conversion.

    Builds a two-column DataFrame (one column from a device array, one
    from a NumPy array), concatenates a single-row frame onto it, and
    checks lengths, column order, column contents and ``as_matrix()``
    output against host-side NumPy expectations.
    """
    np.random.seed(0)
    df = DataFrame()

    # Column populated from a device array.
    host_keys = np.arange(10, dtype=np.float64)
    df['keys'] = rmm.to_device(host_keys)
    np.testing.assert_equal(df['keys'].to_array(), np.arange(10))
    assert len(df) == 10

    # Column populated directly from a NumPy array.
    random_vals = np.random.random(10)
    df['vals'] = random_vals
    np.testing.assert_equal(df['vals'].to_array(), random_vals)
    assert len(df) == 10
    assert tuple(df.columns) == ('keys', 'vals')

    # A second, single-row frame with the same columns.
    extra = DataFrame()
    extra['keys'] = np.array([123], dtype=np.float64)
    extra['vals'] = np.array([321], dtype=np.float64)

    # Concatenate and compare against host-side expectations.
    df = gd.concat([df, extra])
    assert len(df) == 11

    expected_key_list = host_keys.tolist() + [123]
    expected_val_list = random_vals.tolist() + [321]
    hkeys = np.asarray(expected_key_list)
    hvals = np.asarray(expected_val_list)

    np.testing.assert_equal(df['keys'].to_array(), hkeys)
    np.testing.assert_equal(df['vals'].to_array(), hvals)

    # Matrix form: one row per record, one column per DataFrame column.
    mat = df.as_matrix()
    stacked = np.vstack([hkeys, hvals])
    expect = stacked.T

    print(expect)
    print(mat)
    np.testing.assert_equal(mat, expect)