def test_dataframe_to_string():
    """Check DataFrame string rendering: row/column truncation markers and
    blank cells for null (masked) values."""
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic: with nrows=5, a 6-row frame must report one hidden row.
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        string = str(df)
        print(string)
        assert string.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns: ncols=3 on a 4-column frame hides one column.
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        string = df.to_string(ncols=3)
        print(string)
        assert string.splitlines()[-2] == '[1 more rows]'
        assert string.splitlines()[-1] == '[1 more columns]'

        # Test masked: mask bits 0b00101101 mark positions 0, 2, 3, 5 valid.
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        data = np.arange(6)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101
        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked
        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid position is correct
        for i in validids:
            assert data[i] == values[i]
        # null position is correct
        for i in range(len(values)):
            if i not in validids:
                assert values[i] is None
        got = df.to_string(nrows=None)
        print(got)
        # Null entries in column 'c' (rows 1 and 4) render as blanks.
        expect = '''
  a b  c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
def test_categorical_value_counts(num_elements):
    """value_counts on a GPU categorical column must agree with pandas."""
    from string import ascii_letters, digits

    # Build a reproducible random categorical of `num_elements` draws.
    np.random.seed(12)
    alphabet = list(ascii_letters + digits)
    pd_cat = pd.Categorical(
        pd.Series(np.random.choice(alphabet, num_elements),
                  dtype='category'))

    # GPU side.
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_value_counts = gdf['a'].value_counts()

    # CPU (pandas) side.
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_value_counts = pdf['a'].value_counts()

    # Compare as dicts so any ordering difference is irrelevant.
    assert pdf_value_counts.to_dict() == gdf_value_counts.to_pandas().to_dict()
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    """partition_by_hash must produce `nparts` frames with disjoint key sets."""
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        name = 'key{}'.format(i)
        gdf[name] = np.random.randint(0, 7 - i, nrows)
        keycols.append(name)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    parts = gdf.partition_by_hash(keycols, nparts=nparts)

    # Result shape: a list of exactly `nparts` DataFrames.
    assert isinstance(parts, list)
    assert len(parts) == nparts
    assert all(isinstance(p, DataFrame) for p in parts)

    # No key-tuple may appear in more than one partition.
    seen_keys = set()
    for p in parts:
        if len(p):
            # Key-tuples present in this partition.
            keys_here = set(map(tuple, p.as_matrix(columns=keycols)))
            assert not (keys_here & seen_keys)
            seen_keys |= keys_here
    assert len(seen_keys)
def test_dataframe_as_gpu_matrix_null_values():
    """as_gpu_matrix must reject nulls; after fillna it matches reference data."""
    df = DataFrame()

    nelem = 123
    na = -10000
    refvalues = {}
    for k in 'abcd':
        data = np.random.random(nelem)
        df[k] = data
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        # Mirror the mask on the host copy: null slots get the fill value.
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Null values must raise with a message naming the offending column.
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    # After filling nulls the densified matrix equals the host reference.
    for k in df.columns:
        df[k] = df[k].fillna(na)
    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
def test_dataframe_setitem_from_masked_object():
    """NaN / None inputs become nulls regardless of the construction path."""
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    # From a host ndarray containing NaNs.
    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    # Via a pandas round-trip.
    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    # From a device array.
    gpu_ary = cuda.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    # From a plain Python list containing None.
    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
def test_dataframe_loc():
    """.loc row-range + column-list selection, on default and offset indexes."""
    df = DataFrame()
    size = 123
    df['a'] = ha = np.random.randint(low=0, high=100,
                                     size=size).astype(np.int32)
    df['b'] = hb = np.random.random(size).astype(np.float32)
    df['c'] = hc = np.random.randint(low=0, high=100,
                                     size=size).astype(np.int64)
    df['d'] = hd = np.random.random(size).astype(np.float64)

    def check(frame, begin, end):
        # .loc bounds are inclusive, hence end - begin + 1 rows and the
        # end + 1 on the host slices.
        fewer = frame.loc[begin:end, ['c', 'd', 'a']]
        assert len(fewer) == end - begin + 1
        assert fewer.columns == tuple(['c', 'd', 'a'])
        np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
        np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
        np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])

    check(df, 117, 122)

    # Same selection against a frame whose index starts at `offset`.
    offset = 50
    check(df[offset:], 117, 122)
def test_label_encode_drop_one():
    """label_encoding must map values absent from `cats` to -1."""
    random.seed(0)
    np.random.seed(0)
    df = DataFrame()
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)

    # Keep every unique value except one chosen at random.
    vals = list(df['cats'].unique())
    del vals[random.randrange(len(vals))]
    lab = dict(zip(vals, list(range(len(vals)))))

    # Series-level encoding.
    ncol = df['cats'].label_encoding(cats=vals, dtype='float32')
    arr = ncol.to_array()
    for i in range(arr.size):
        # -1 is the sentinel for a value not listed in `cats`.
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # DataFrame-level encoding appends a '<prefix>_labels' column.
    df2 = df.label_encoding(column='cats', prefix='cats', cats=vals,
                            dtype='float32')
    assert df2.columns[0] == 'cats'
    assert df2.columns[1] == 'cats_labels'
def test_dataframe_join_suffix():
    """join must refuse overlapping columns unless lsuffix/rsuffix are given."""
    np.random.seed(0)
    df = DataFrame()
    for k in 'abc':
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # Column 'b' overlaps; without suffixes the join must raise.
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right', sort=True)

    # Expected value: pandas performing the same suffixed join.
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left', rsuffix='_right')

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k], got[k])
def test_dataframe_join_how(aa, bb, how, method):
    """Compare a gdf index join against the equivalent pandas join.

    `aa`/`bb` supply the two key columns; `how` is the join type and
    `method` the gdf join implementation. Timing printed is informational.
    """
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work_pandas(df):
        # Reference join on the CPU.
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    def work_gdf(df):
        # Same join on the GPU, with the requested implementation method.
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    # Untouched copies for the 'outer' comparison branch below.
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType: recast both sides to float64 so
    # missing values compare as NaN on both.
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    got.drop_column('b')
    got.add_column('b', gotb.astype(np.float64).fillna(np.nan))
    got.drop_column('a')
    got.add_column('a', gota.astype(np.float64).fillna(np.nan))
    # NOTE(review): pandas DataFrame.drop is not in-place without
    # inplace=True, so these two calls are no-ops; the assignments below
    # overwrite the columns anyway, so the comparison still works.
    expect.drop(['b'], axis=1)
    expect['b'] = expectb.astype(np.float64).fillna(np.nan)
    expect.drop(['a'], axis=1)
    expect['a'] = expecta.astype(np.float64).fillna(np.nan)
    # print(expect)
    # print(got.to_string(nrows=None))
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    if(how != 'outer'):
        # Order-insensitive full-frame comparison.
        pd.util.testing.assert_frame_equal(
            got.to_pandas().sort_values(['b', 'a']).reset_index(drop=True),
            expect.sort_values(['b', 'a']).reset_index(drop=True))
        # if(how=='right'):
        #     _sorted_check_series(expect['a'], expect['b'],
        #                          got['a'], got['b'])
        # else:
        #     _sorted_check_series(expect['b'], expect['a'], got['b'],
        #                          got['a'])
    else:
        # Outer joins are checked column-by-column on the pristine copies.
        _check_series(expecto['b'], goto['b'])
        _check_series(expecto['a'], goto['a'])
def test_assign():
    """assign returns a new frame with the extra column; source is untouched."""
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    # The original keeps only 'x'; the returned copy gains 'y'.
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']
    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
def test_pickle_dataframe_numeric():
    """Round-trip serialization of a frame with two numeric columns."""
    np.random.seed(0)
    nelem = 10
    df = DataFrame()
    df['keys'] = np.arange(nelem, dtype=np.float64)
    df['vals'] = np.random.random(nelem)
    check_serialization(df)
def test_dataframe_astype():
    """astype yields a column of the requested dtype with values preserved."""
    df = DataFrame()
    data = np.asarray(range(10), dtype=np.int32)
    df['a'] = data
    assert df['a'].dtype == np.dtype(np.int32)

    # int32 -> float32 cast; the small integer values survive exactly.
    df['b'] = df['a'].astype(np.float32)
    assert df['b'].dtype == np.dtype(np.float32)
    np.testing.assert_equal(df['a'].to_array(), df['b'].to_array())
def test_pickle_dataframe_categorical():
    """Round-trip serialization of a frame holding a categorical column."""
    np.random.seed(0)
    df = DataFrame()
    df['keys'] = pd.Categorical("aaabababac")
    df['vals'] = np.random.random(len(df))
    check_serialization(df)
def test_to_records_noindex():
    """to_records(index=False) yields a structured array without the index."""
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ('a', 'b')
    np.testing.assert_array_equal(rec['a'], aa)
    np.testing.assert_array_equal(rec['b'], bb)
def test_dataframe_empty_concat():
    """Concatenating two empty frames keeps the columns and zero rows."""
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
def test_dataframe_append_to_empty():
    """Adding a non-empty column after an empty one must match pandas."""
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_dataframe_join_cats():
    """Join on a categorical index; loose sanity checks only.

    Note: pandas fails to join on a categorical index, so instead of an
    exact comparison we only verify columns, length and value membership.
    """
    ldf = DataFrame()
    ldf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    ldf['b'] = bb = np.arange(len(ldf))
    lhs = ldf.set_index('a')

    rdf = DataFrame()
    rdf['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rdf['c'] = cc = np.arange(len(rdf))
    rhs = rdf.set_index('a')

    got = lhs.join(rhs)

    # Rough checks: expected columns, non-empty result, values drawn from
    # the inputs.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
def test_dataframe_multi_column_join():
    """Left merge on two key columns must match pandas (ordering ignored)."""
    np.random.seed(0)

    def make_frame(nelem):
        # Two key columns with deliberate collisions plus a payload column.
        frame = DataFrame()
        frame['key1'] = np.random.randint(0, 30, nelem)
        frame['key2'] = np.random.randint(0, 50, nelem)
        frame['val1'] = np.arange(nelem)
        return frame

    df_left = make_frame(500)
    df_right = make_frame(500)

    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result from pandas.
    pddf_joined = pddf_left.merge(pddf_right, on=['key1', 'key2'],
                                  how='left', sort=True)

    # GPU merge (row order is not checked).
    join_result = df_left.merge(df_right, on=['key1', 'key2'], how='left')

    # Right-side ('_y') columns can hold nulls; recast to float64 so they
    # compare as NaN against pandas.
    for col in list(pddf_joined.columns):
        if col.count('_y') > 0:
            join_result[col] = (join_result[col]
                                .astype(np.float64)
                                .fillna(np.nan))

    pd.util.testing.assert_frame_equal(
        join_result
        .to_pandas()
        .sort_values(list(pddf_joined.columns))
        .reset_index(drop=True),
        pddf_joined)
def test_to_records_withindex():
    """to_records(index=True) prepends an 'index' field to the record dtype."""
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec_indexed = df.to_records(index=True)
    assert rec_indexed.size == len(aa)
    assert rec_indexed.dtype.names == ('index', 'a', 'b')
    np.testing.assert_array_equal(rec_indexed['a'], aa)
    np.testing.assert_array_equal(rec_indexed['b'], bb)
    np.testing.assert_array_equal(rec_indexed['index'], np.arange(10))
def test_dataframe_sort_values(nelem, dtype):
    """sort_values(by='a') must match a stable numpy argsort on the host."""
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = (100 * np.random.random(nelem)).astype(dtype)
    df['b'] = bb = (100 * np.random.random(nelem)).astype(dtype)

    sorted_df = df.sort_values(by='a')

    # mergesort is stable, pinning the expected order of tied keys.
    sorted_index = np.argsort(aa, kind='mergesort')
    np.testing.assert_array_equal(sorted_df.index.values, sorted_index)
    np.testing.assert_array_equal(sorted_df['a'], aa[sorted_index])
    np.testing.assert_array_equal(sorted_df['b'], bb[sorted_index])
def test_reading_arrow_sparse_data():
    """End-to-end pipeline on an Arrow buffer: classify columns as numeric or
    categorical, normalize numerics, one-hot encode categoricals, and verify
    the densified matrix is all-float64 with values in [0, 1]."""
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            # unique() failed for this column: fall back to numeric handling
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                # constant column carries no information — drop it
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                # too many distinct values to one-hot encode — treat as numeric
                num_cols.add(k)

    # Fix numeric columns: impute with the mean, then standardize.
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns into one-hot indicator columns.
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Print dtypes: everything remaining must be float64.
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    # Scaled/one-hot values are bounded by [0, 1].
    assert mat.max() == 1
    assert mat.min() == 0
def test_df_cat_sort_index():
    """sort_index on a categorical index must match pandas."""
    df = DataFrame()
    df['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    df['b'] = np.arange(len(df))

    got = df.set_index('a').sort_index()
    expect = df.to_pandas().set_index('a').sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())
def test_dataframe_nsmallest(nelem, n):
    """nsmallest(n, 'a') returns the n smallest rows, ascending in 'a'."""
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = np.random.random(nelem)
    df['b'] = bb = np.random.random(nelem)

    res = df.nsmallest(n, 'a')

    # Descending argsort of 'a': the last n entries, reversed, are the n
    # smallest values in ascending order.
    inds = np.argsort(-aa)
    np.testing.assert_array_equal(res['a'].to_array(), aa[inds][-n:][::-1])
    np.testing.assert_array_equal(res['b'].to_array(), bb[inds][-n:][::-1])
    np.testing.assert_array_equal(res.index.values, inds[-n:][::-1])
def test_query_env_changing():
    """query must re-read @-referenced variables from the caller's scope."""
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    expr = 'a < @c'

    # First evaluation binds @c to 10.
    c = 10
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())

    # Rebinding c must be picked up by the next query call.
    c = 50
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
def test_df_set_index_from_series():
    """set_index(Series) keeps both columns and supports strided .loc slices."""
    df = DataFrame()
    df['a'] = list(range(10))
    df['b'] = list(range(0, 20, 2))

    # Check set_index(Series): 'b' stays a column while also indexing.
    df2 = df.set_index(df['b'])
    assert list(df2.columns) == ['a', 'b']

    # Index values step by 2, so [2:6] covers 2, 4 and 6.
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
def test_dataframe_column_add_drop():
    """Columns keep insertion order; a re-added name goes to the end."""
    df = DataFrame()
    data = np.asarray(range(10))
    df['a'] = data
    df['b'] = data
    assert df.columns == ('a', 'b')

    del df['a']
    assert df.columns == ('b',)

    df['c'] = data
    assert df.columns == ('b', 'c')

    # 'a' returns at the end, not at its old position.
    df['a'] = data
    assert df.columns == ('b', 'c', 'a')
def test_dataframe_dir_and_getattr():
    """Only identifier-like column names are exposed via dir() and getattr."""
    df = DataFrame([('a', np.ones(10)),
                    ('b', np.ones(10)),
                    ('not an id', np.ones(10)),
                    ('oop$', np.ones(10))])
    listing = dir(df)
    assert {'a', 'b'}.issubset(listing)
    assert 'not an id' not in listing
    assert 'oop$' not in listing

    # Attribute access resolves to the very same column objects.
    assert df.a is df['a']
    assert df.b is df['b']
    with pytest.raises(AttributeError):
        df.not_a_column
def test_nonmatching_index_setitem(nrows):
    """Assigning a host array to an indexed frame aligns on that index."""
    np.random.seed(0)

    gdf = DataFrame()
    gdf['a'] = np.random.randint(2147483647, size=nrows)
    gdf['b'] = np.random.randint(2147483647, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(2147483647, size=nrows)
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])
    # The new column must carry the frame's index, matching an explicitly
    # indexed Series of the same values.
    assert gdf['c'].to_pandas().equals(
        Series(test_values).set_index(gdf._index).to_pandas())
def test_dataframe_basic():
    """Construction from device and host data, concat, and as_matrix."""
    np.random.seed(0)
    df = DataFrame()

    # Populate with cuda memory.
    df['keys'] = cuda.to_device(np.arange(10, dtype=np.float64))
    np.testing.assert_equal(df['keys'].to_array(), np.arange(10))
    assert len(df) == 10

    # Populate with a numpy array.
    rnd_vals = np.random.random(10)
    df['vals'] = rnd_vals
    np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
    assert len(df) == 10
    assert df.columns == ('keys', 'vals')

    # Build a single-row frame and concatenate it on the end.
    df2 = DataFrame()
    df2['keys'] = np.array([123], dtype=np.float64)
    df2['vals'] = np.array([321], dtype=np.float64)
    df = df.concat(df2)
    assert len(df) == 11

    hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123])
    hvals = np.asarray(rnd_vals.tolist() + [321])
    np.testing.assert_equal(df['keys'].to_array(), hkeys)
    np.testing.assert_equal(df['vals'].to_array(), hvals)

    # As matrix: columns become matrix columns, row order preserved.
    mat = df.as_matrix()
    expect = np.vstack([hkeys, hvals]).T
    print(expect)
    print(mat)
    np.testing.assert_equal(mat, expect)
def test_sizeof_dataframe():
    """__sizeof__ covers the payload and roughly matches the pickle size."""
    np.random.seed(0)
    nelem = 1000
    df = DataFrame()
    df['keys'] = hkeys = np.arange(nelem, dtype=np.float64)
    df['vals'] = hvals = np.random.random(nelem)

    payload = hkeys.nbytes + hvals.nbytes
    sizeof = sys.getsizeof(df)
    assert sizeof >= payload

    serialized_nbytes = len(pickle.dumps(df,
                                         protocol=pickle.HIGHEST_PROTOCOL))
    # Serialized size should be close to what __sizeof__ is giving.
    np.testing.assert_approx_equal(sizeof, serialized_nbytes, significant=2)