def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in 'abc':
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right', sort=True)
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left', rsuffix='_right')
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k], got[k])
def test_dataframe_join_mismatch_cats(how):
    pdf1 = pd.DataFrame({"join_col": ["a", "b", "c", "d", "e"],
                         "data_col_left": [10, 20, 30, 40, 50]})
    pdf2 = pd.DataFrame({"join_col": ["c", "e", "f"],
                         "data_col_right": [6, 7, 8]})

    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")

    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)

    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")

    pdf1 = pdf1.set_index('join_col')
    pdf2 = pdf2.set_index('join_col')

    join_gdf = gdf1.join(gdf2, how=how, sort=True)
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    expect = join_pdf.fillna(-1)  # note: pygdf join doesn't mask NA

    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)
    pd.util.testing.assert_frame_equal(got, expect, check_names=False,
                                       check_index_type=False,
                                       # For inner joins, pandas returns
                                       # weird categories.
                                       check_categorical=how != 'inner')
    assert list(got.index) == list(expect.index)
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']
    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    gpu_ary = cuda.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take rows of the key columns and build a set of the key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other groups
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
def test_label_encode_drop_one():
    random.seed(0)
    np.random.seed(0)
    df = DataFrame()

    # initialize data frame
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)
    vals = list(df['cats'].unique())
    # drop 1 randomly
    del vals[random.randrange(len(vals))]

    lab = dict(zip(vals, list(range(len(vals)))))

    # label encode series
    ncol = df['cats'].label_encoding(cats=vals, dtype='float32')
    arr = ncol.to_array()

    # verify labels of new column
    for i in range(arr.size):
        # assuming -1 is used for missing value
        np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1))

    # label encode data frame
    df2 = df.label_encoding(column='cats', prefix='cats', cats=vals,
                            dtype='float32')

    assert df2.columns[0] == 'cats'
    assert df2.columns[1] == 'cats_labels'
def test_dataframe_join_how(aa, bb, how, method):
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work_pandas(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    def work_gdf(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    got.drop_column('b')
    got.add_column('b', gotb.astype(np.float64).fillna(np.nan))
    got.drop_column('a')
    got.add_column('a', gota.astype(np.float64).fillna(np.nan))
    expect.drop(['b'], axis=1)
    expect['b'] = expectb.astype(np.float64).fillna(np.nan)
    expect.drop(['a'], axis=1)
    expect['a'] = expecta.astype(np.float64).fillna(np.nan)

    # print(expect)
    # print(got.to_string(nrows=None))

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    if how != 'outer':
        pd.util.testing.assert_frame_equal(
            got.to_pandas().sort_values(['b', 'a']).reset_index(drop=True),
            expect.sort_values(['b', 'a']).reset_index(drop=True))
        # if how == 'right':
        #     _sorted_check_series(expect['a'], expect['b'],
        #                          got['a'], got['b'])
        # else:
        #     _sorted_check_series(expect['b'], expect['a'], got['b'],
        #                          got['a'])
    else:
        _check_series(expecto['b'], goto['b'])
        _check_series(expecto['a'], goto['a'])
def test_to_records_noindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec = df.to_records(index=False)
    assert rec.dtype.names == ('a', 'b')
    np.testing.assert_array_equal(rec['a'], aa)
    np.testing.assert_array_equal(rec['b'], bb)
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
def test_dataframe_join_cats():
    lhs = DataFrame()
    lhs['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    lhs['b'] = bb = np.arange(len(lhs))
    lhs = lhs.set_index('a')

    rhs = DataFrame()
    rhs['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rhs['c'] = cc = np.arange(len(rhs))
    rhs = rhs.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining
    pd.util.testing.assert_frame_equal(
        got.sort_values(by='b')
           .to_pandas()
           .sort_index()
           .reset_index(drop=True),
        expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
def test_to_records_withindex():
    df = DataFrame()
    df['a'] = aa = np.arange(10, dtype=np.int32)
    df['b'] = bb = np.arange(10, 20, dtype=np.float64)

    rec_indexed = df.to_records(index=True)
    assert rec_indexed.size == len(aa)
    assert rec_indexed.dtype.names == ('index', 'a', 'b')
    np.testing.assert_array_equal(rec_indexed['a'], aa)
    np.testing.assert_array_equal(rec_indexed['b'], bb)
    np.testing.assert_array_equal(rec_indexed['index'], np.arange(10))
def test_dataframe_sort_values(nelem, dtype):
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = (100 * np.random.random(nelem)).astype(dtype)
    df['b'] = bb = (100 * np.random.random(nelem)).astype(dtype)
    sorted_df = df.sort_values(by='a')
    # Check
    sorted_index = np.argsort(aa, kind='mergesort')
    np.testing.assert_array_equal(sorted_df.index.values, sorted_index)
    np.testing.assert_array_equal(sorted_df['a'], aa[sorted_index])
    np.testing.assert_array_equal(sorted_df['b'], bb[sorted_index])
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Check dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique_k(k=1000)
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Check dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
def test_df_cat_sort_index():
    df = DataFrame()
    df['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    df['b'] = np.arange(len(df))

    got = df.set_index('a').sort_index()
    expect = df.to_pandas().set_index('a').sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())
def test_df_set_index_from_series():
    df = DataFrame()
    df['a'] = list(range(10))
    df['b'] = list(range(0, 20, 2))

    # Check set_index(Series)
    df2 = df.set_index(df['b'])
    assert list(df2.columns) == ['a', 'b']
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
def test_dataframe_nsmallest(nelem, n):
    np.random.seed(0)
    df = DataFrame()
    df['a'] = aa = np.random.random(nelem)
    df['b'] = bb = np.random.random(nelem)
    res = df.nsmallest(n, 'a')

    # Check
    inds = np.argsort(-aa)
    np.testing.assert_array_equal(res['a'].to_array(), aa[inds][-n:][::-1])
    np.testing.assert_array_equal(res['b'].to_array(), bb[inds][-n:][::-1])
    np.testing.assert_array_equal(res.index.values, inds[-n:][::-1])
def test_query_env_changing():
    df = DataFrame()
    df['a'] = aa = np.arange(100)
    expr = 'a < @c'
    # first attempt
    c = 10
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
    # change env
    c = 50
    got = df.query(expr)
    np.testing.assert_array_equal(aa[aa < c], got['a'].to_array())
def test_nonmatching_index_setitem(nrows):
    np.random.seed(0)

    gdf = DataFrame()
    gdf['a'] = np.random.randint(2147483647, size=nrows)
    gdf['b'] = np.random.randint(2147483647, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(2147483647, size=nrows)
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])
    assert gdf['c'].to_pandas().equals(
        Series(test_values).set_index(gdf._index).to_pandas())
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    gdf = DataFrame()
    gdf['a'] = list(range(nelem))
    gdf['b'] = list(range(nelem, 2 * nelem))
    gdf['a'] = gdf['a'].set_mask(utils.random_bitmask(nelem))
    gdf['b'] = gdf['b'].set_mask(utils.random_bitmask(nelem))

    def do_slice(x):
        return x[slice_start:slice_end]

    expect = do_slice(gdf.to_pandas())
    got = do_slice(gdf).to_pandas()

    pd.testing.assert_frame_equal(expect, got)
def test_to_pandas():
    df = DataFrame()
    df['a'] = np.arange(10, dtype=np.int32)
    df['b'] = np.arange(10, 20, dtype=np.float64)

    pdf = df.to_pandas()
    assert tuple(df.columns) == tuple(pdf.columns)
    assert df['a'].dtype == pdf['a'].dtype
    assert df['b'].dtype == pdf['b'].dtype
    assert len(df['a']) == len(pdf['a'])
    assert len(df['b']) == len(pdf['b'])
def test_df_set_index_from_name():
    df = DataFrame()
    df['a'] = list(range(10))
    df['b'] = list(range(0, 20, 2))

    # Check set_index(column_name)
    df2 = df.set_index('b')
    print(df2)
    # 1 less column because 'b' is used as index
    assert list(df2.columns) == ['a']
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
def test_onehot_random():
    df = DataFrame()
    low = 10
    high = 17
    size = 10
    df['src'] = src = np.random.randint(low=low, high=high, size=size)
    df2 = df.one_hot_encoding(column='src', prefix='out_',
                              cats=tuple(range(10, 17)))
    mat = df2.as_matrix(columns=df2.columns[1:])

    for val in range(low, high):
        colidx = val - low
        arr = mat[:, colidx]
        mask = src == val
        np.testing.assert_equal(arr, mask)
def test_query(data, fn):
    # prepare
    nelem, seed = data
    expect_fn, query_expr = fn
    np.random.seed(seed)
    df = DataFrame()
    df['a'] = aa = np.arange(nelem)
    df['b'] = bb = np.random.random(nelem) * nelem
    # udt
    expect_mask = expect_fn(aa, bb)
    df2 = df.query(query_expr)
    # check
    assert len(df2) == np.count_nonzero(expect_mask)
    np.testing.assert_array_almost_equal(df2['a'].to_array(), aa[expect_mask])
    np.testing.assert_array_almost_equal(df2['b'].to_array(), bb[expect_mask])
def test_categorical_value_counts(num_elements):
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    pd_cat = pd.Categorical(
        pd.Series(
            np.random.choice(list(ascii_letters + digits), num_elements),
            dtype='category'
        )
    )

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_value_counts = gdf['a'].value_counts()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_value_counts = pdf['a'].value_counts()

    # verify
    pandas_dict = pdf_value_counts.to_dict()
    gdf_dict = gdf_value_counts.to_pandas().to_dict()

    assert pandas_dict == gdf_dict
def test_dataframe_hash_partition_masked_value(nrows):
    gdf = DataFrame()
    gdf['key'] = np.arange(nrows)
    gdf['val'] = np.arange(nrows) + 100
    bitmask = utils.random_bitmask(nrows)
    bytemask = utils.expand_bits_to_bytes(bitmask)
    gdf['val'] = gdf['val'].set_mask(bitmask)
    parted = gdf.partition_by_hash(['key'], nparts=3)
    # Verify that the valid mask is correct
    for p in parted:
        df = p.to_pandas()
        for row in df.itertuples():
            valid = bool(bytemask[row.key])
            expected_value = row.key + 100 if valid else -1
            got_value = row.val
            assert expected_value == got_value
def test_groupby_iterate_groups():
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    def assert_values_equal(arr):
        np.testing.assert_array_equal(arr[0], arr)

    for grp in df.groupby(['key1', 'key2']):
        pddf = grp.to_pandas()
        for k in 'key1,key2'.split(','):
            assert_values_equal(pddf[k].values)
def test_dataframe_to_string_wide():
    # Test basic
    df = DataFrame()
    for i in range(100):
        df['a{}'.format(i)] = list(range(3))
    got = df.to_string(ncols=8)
    print(got)
    expect = '''
    a0   a1   a2   a3   a4   a5   a6 ...  a99
0    0    0    0    0    0    0    0 ...    0
1    1    1    1    1    1    1    1 ...    1
2    2    2    2    2    2    2    2 ...    2
[92 more columns]
'''
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_onehot_generic_index():
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df['fo'] = Series(values, index=GenericIndex(indices))
    out = df.one_hot_encoding('fo', cats=df.fo.unique(), prefix='fo',
                              dtype=np.int32)
    assert set(out.columns) == {'fo', 'fo_0', 'fo_1', 'fo_2', 'fo_3'}
    np.testing.assert_array_equal(values == 0, out.fo_0.to_array())
    np.testing.assert_array_equal(values == 1, out.fo_1.to_array())
    np.testing.assert_array_equal(values == 2, out.fo_2.to_array())
    np.testing.assert_array_equal(values == 3, out.fo_3.to_array())
def test_dataframe_join_cats():
    ldf = DataFrame()
    ldf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    ldf['b'] = bb = np.arange(len(ldf))
    lhs = ldf.set_index('a')

    rdf = DataFrame()
    rdf['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rdf['c'] = cc = np.arange(len(rdf))
    rhs = rdf.set_index('a')

    got = lhs.join(rhs)

    # Just do some rough checking here.
    # Note: pandas fails to join on categorical index.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
def test_from_pandas():
    pdf = pd.DataFrame()
    pdf['a'] = np.arange(10, dtype=np.int32)
    pdf['b'] = np.arange(10, 20, dtype=np.float64)

    df = DataFrame.from_pandas(pdf)
    assert tuple(df.columns) == tuple(pdf.columns)
    assert df['a'].dtype == pdf['a'].dtype
    assert df['b'].dtype == pdf['b'].dtype
    assert len(df['a']) == len(pdf['a'])
    assert len(df['b']) == len(pdf['b'])
def test_from_pandas_ex1():
    pdf = pd.DataFrame({'a': [0, 1, 2, 3],
                        'b': [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)
    assert np.all(df['a'].to_array() == pdf['a'])
    matches = df['b'].to_array() == pdf['b']
    # the 3rd element is False because (nan == nan) evaluates to False
    assert np.all(matches == [True, True, False, True])
    assert np.isnan(df['b'].to_array()[2])
    assert np.isnan(pdf['b'][2])
def test_onehot_simple():
    np.random.seed(0)
    df = DataFrame()
    # Populate with data [0, 10)
    df['vals'] = np.arange(10, dtype=np.int32)
    # One Hot (Series)
    for i, col in enumerate(df['vals'].one_hot_encoding(list(range(10)))):
        arr = col.to_array()
        # Verify 1 in the right position
        np.testing.assert_equal(arr[i], 1)
        # Every other slot is 0
        np.testing.assert_equal(arr[:i], 0)
        np.testing.assert_equal(arr[i + 1:], 0)
    # One Hot (DataFrame)
    df2 = df.one_hot_encoding(column='vals', prefix='vals',
                              cats=list(range(10)))
    assert df2.columns[0] == 'vals'
    for i in range(1, len(df2.columns)):
        assert df2.columns[i] == 'vals_%s' % (i - 1)
    got = df2.as_matrix(columns=df2.columns[1:])
    expect = np.identity(got.shape[0])
    np.testing.assert_equal(got, expect)
def test_dataframe_join_how(aa, bb, how):
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work(df):
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work(df.to_pandas())
    got = work(df)

    # print(expect)
    # print(got.to_string(nrows=None))

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    _check_series(expect['b'], got['b'])
    _check_series(expect['a'], got['a'])
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in 'abc':
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match("there are overlapping columns but lsuffix"
                 " and rsuffix are not defined")

    got = left.join(right, lsuffix='_left', rsuffix='_right')
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'),
                                      lsuffix='_left', rsuffix='_right')
    # Check
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for k in expect.columns:
        _check_series(expect[k], got[k])
def test_from_records_noindex(columns):
    recdtype = np.dtype([
        ('a', np.int32),
        ('b', np.float64),
    ])
    rec = np.recarray(10, dtype=recdtype)
    rec.a = aa = np.arange(10, dtype=np.int32)
    rec.b = bb = np.arange(10, 20, dtype=np.float64)
    df = DataFrame.from_records(rec, columns=columns)

    if columns and 'a' in columns:
        np.testing.assert_array_equal(aa, df['a'])
    if columns and 'b' in columns:
        np.testing.assert_array_equal(bb, df['b'])
    np.testing.assert_array_equal(np.arange(10), df.index.values)