def test_to_html_compat(self):
    """Check that ``to_html`` output parses back into an equal frame.

    A 4x3 frame of random floats is rounded to three decimals (formatted to
    text, then cast back to float), rendered with ``DataFrame.to_html`` and
    re-read through ``self.read_html``, selecting the table by its
    ``dataframe`` CSS class and using the first column as the index.
    """

    def _random_value(*args):
        # Whatever positional arguments the generator passes are ignored.
        return np.random.rand()

    raw = tm.makeCustomDataframe(
        4,
        3,
        data_gen_f=_random_value,
        c_idx_names=False,
        r_idx_names=False,
    )
    # Format to three decimals and cast back to float, so the frame only
    # holds values with at most three decimal places.
    df = raw.applymap("{:.3f}".format).astype(float)

    html = df.to_html()
    tables = self.read_html(html, attrs={"class": "dataframe"}, index_col=0)
    tm.assert_frame_equal(tables[0], df)
def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols):
    """Compare the CSV round trip of a frame with typed row/column indexes.

    The frame is built with the requested index type codes and handed to
    ``self._return_result_expected`` with a chunk size of 1000; the pair it
    returns must be equal, ignoring index names.
    """
    frame = tm.makeCustomDataframe(
        nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
    )
    outcome = self._return_result_expected(frame, 1000, r_idx_type, c_idx_type)
    result, expected = outcome
    tm.assert_frame_equal(result, expected, check_names=False)
def test_to_csv_dups_cols(self):
    """Round-trip frames with duplicate column labels through CSV.

    Three scenarios are covered:

    * a single-dtype frame whose 15 integer labels each appear twice;
    * a mixed-dtype frame (float, int, bool, object, datetime) whose columns
      are relabelled ``0, 1, 2`` five times over;
    * GH3457: string labels ``["a", "a", "b"]``, where ``read_csv`` renames
      the second ``"a"`` to ``"a.1"``.
    """
    df = DataFrame(
        np.random.randn(1000, 30),
        columns=list(range(15)) + list(range(15)),
        dtype="float64",
    )

    with tm.ensure_clean() as filename:
        df.to_csv(filename)  # single dtype, fine
        result = read_csv(filename, index_col=0)
        # read_csv de-duplicates the labels; restore the originals.
        result.columns = df.columns
        tm.assert_frame_equal(result, df)

    df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
    # Cast explicitly: handing float data plus an integer dtype to the
    # constructor depends on a lossy truncation that pandas deprecated and
    # later stopped performing. ``astype`` states the intent directly.
    df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
    df_bool = DataFrame(True, index=df_float.index, columns=range(3))
    df_object = DataFrame("foo", index=df_float.index, columns=range(3))
    df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
    df = pd.concat(
        [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
    )

    # One (0, 1, 2) triple per concatenated block.
    df.columns = [0, 1, 2] * 5

    with tm.ensure_clean() as filename:
        df.to_csv(filename)
        result = read_csv(filename, index_col=0)

        # date cols: these are the labels read_csv gives the fifth block.
        for i in ["0.4", "1.4", "2.4"]:
            result[i] = to_datetime(result[i])

        result.columns = df.columns
        tm.assert_frame_equal(result, df)

    # GH3457
    N = 10
    df = tm.makeCustomDataframe(N, 3)
    df.columns = ["a", "a", "b"]

    with tm.ensure_clean() as filename:
        df.to_csv(filename)

        # read_csv will rename the dups columns
        result = read_csv(filename, index_col=0)
        result = result.rename(columns={"a.1": "a"})
        tm.assert_frame_equal(result, df)
def test_to_csv_dup_cols(self, nrows):
    """Round-trip a frame whose first/last two row and column labels repeat.

    The first two and last two column labels are replaced by ``"dupe"`` and
    the corresponding row labels by ``"rdupe"`` before the frame is passed to
    ``self._return_result_expected`` with ``dupe_col=True``.
    """
    frame = tm.makeCustomDataframe(nrows, 3)

    col_labels = list(frame.columns)
    col_labels[:2] = ["dupe"] * 2
    col_labels[-2:] = ["dupe"] * 2

    row_labels = list(frame.index)
    row_labels[:2] = ["rdupe"] * 2
    row_labels[-2:] = ["rdupe"] * 2

    frame.index = row_labels
    frame.columns = col_labels

    outcome = self._return_result_expected(frame, 1000, dupe_col=True)
    result, expected = outcome
    tm.assert_frame_equal(result, expected, check_names=False)
def test_concat_invalid(self):
    """``concat`` must raise ``TypeError`` when given a non-NDFrame object.

    A frame is concatenated with an int, a dict, a list and a tuple in turn;
    each attempt must raise with a message naming the offending type.
    """
    # trying to concat a ndframe with a non-ndframe
    frame = tm.makeCustomDataframe(10, 2)
    template = (
        "cannot concatenate object of type '{}'; "
        "only Series and DataFrame objs are valid"
    )
    for bad in (1, {}, [1, 2], (1, 2)):
        expected_msg = template.format(type(bad))
        with pytest.raises(TypeError, match=expected_msg):
            concat([frame, bad])
def test_join_with_period_index(self, join_type):
    """Joining period columns with a datetime index matches the object join.

    The columns (type code ``"p"``) are joined with the index of the first
    five rows (type code ``"dt"``), once directly and once after casting the
    columns to object dtype; both joins must give the same index.
    """
    frame = tm.makeCustomDataframe(
        10,
        10,
        data_gen_f=lambda *args: np.random.randint(2),
        c_idx_type="p",
        r_idx_type="dt",
    )
    columns = frame.columns
    row_index = frame.iloc[:5, 0].index

    expected = columns.astype("O").join(row_index, how=join_type)
    result = columns.join(row_index, how=join_type)

    tm.assert_index_equal(expected, result)
def test_does_not_convert_mixed_integer(self):
    """An outer join of datetime columns with an integer index stays object.

    The joined index must have object dtype, and joining it with the columns
    again must leave both dtype and values unchanged.
    """
    frame = tm.makeCustomDataframe(
        10,
        10,
        data_gen_f=lambda *args, **kwargs: np.random.randn(),
        r_idx_type="i",
        c_idx_type="dt",
    )
    outer = frame.columns.join(frame.index, how="outer")
    rejoined = outer.join(frame.columns)

    object_dtype = np.dtype("O")
    assert outer.dtype == object_dtype
    assert outer.dtype == rejoined.dtype
    tm.assert_numpy_array_equal(outer.values, rejoined.values)
def test_join_does_not_recur(self):
    """Outer-join a datetime index with period columns into an object index.

    The expected result lists the two row labels followed by the two column
    labels in an object-dtype ``Index``.
    """
    frame = tm.makeCustomDataframe(
        3,
        2,
        data_gen_f=lambda *args: np.random.randint(2),
        c_idx_type="p",
        r_idx_type="dt",
    )
    row_index = frame.iloc[:2, 0].index
    col_index = frame.columns

    res = row_index.join(col_index, how="outer")

    labels = [row_index[0], row_index[1], col_index[0], col_index[1]]
    expected = Index(labels, object)
    tm.assert_index_equal(res, expected)
def test_excel_010_hemstring(
    self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path
):
    """Write frames to Excel and check the shape of what is read back.

    First a one-cell frame is round-tripped, then a 5x3 frame with the
    requested number of row/column index levels. Writing MultiIndex columns
    with ``index=False`` is expected to raise ``NotImplementedError``.
    """

    def roundtrip(data, header=True, parser_hdr=0, index=True):
        # Write ``data`` to ``path`` and read the first sheet back.
        data.to_excel(path, header=header, merge_cells=merge_cells, index=index)
        with ExcelFile(path) as xf:
            first_sheet = xf.sheet_names[0]
            return pd.read_excel(xf, sheet_name=first_sheet, header=parser_hdr)

    # Basic test.
    if use_headers:
        parser_header = 0
    else:
        parser_header = None
    res = roundtrip(DataFrame([0]), use_headers, parser_header)

    assert res.shape == (1, 2)
    assert res.iloc[0, 0] is not np.nan

    # More complex tests with multi-index.
    nrows, ncols = 5, 3

    # ensure limited functionality in 0.10
    # override of gh-2370 until sorted out in 0.11
    df = tm.makeCustomDataframe(
        nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels
    )

    # This branch will be removed once multi-column Excel writing
    # is implemented. For now fixing gh-9794.
    if c_idx_nlevels > 1:
        msg = (
            "Writing to Excel with MultiIndex columns and no index "
            "\\('index'=False\\) is not yet implemented."
        )
        with pytest.raises(NotImplementedError, match=msg):
            roundtrip(df, use_headers, index=False)
        return

    res = roundtrip(df, use_headers)

    # Without headers the first data row is taken as the column labels.
    expected_rows = nrows if use_headers else nrows - 1
    assert res.shape == (expected_rows, ncols + r_idx_nlevels)

    # No NaNs.
    n_res_rows = len(res.index)
    n_res_cols = len(res.columns)
    for r in range(n_res_rows):
        for c in range(n_res_cols):
            assert res.iloc[r, c] is not np.nan
def test_to_csv_cols_reordering(self):
    """GH3454: ``to_csv(columns=...)`` honours a reordered column subset.

    The third and first columns are written, in that order, with a chunk
    size smaller than the number of rows; the file must read back equal to
    the same selection on the original frame.
    """
    # GH3454
    chunksize = 5
    n_rows = int(chunksize * 2.5)

    df = tm.makeCustomDataframe(n_rows, 3)
    labels = df.columns
    cols = [labels[2], labels[0]]

    with tm.ensure_clean() as path:
        df.to_csv(path, columns=cols, chunksize=chunksize)
        rs_c = read_csv(path, index_col=0)

    selected = df[cols]
    tm.assert_frame_equal(selected, rs_c, check_names=False)
def test_loc_empty_list_indexer_is_ok(self):
    """``.loc`` with an empty list behaves like an empty ``.iloc`` slice.

    Checked for an empty column selection and for an empty row selection,
    the latter written both as ``[[], :]`` and as ``[[]]``; index and column
    types must match as well.
    """
    df = tm.makeCustomDataframe(5, 2)

    def _assert_same(actual, expected):
        tm.assert_frame_equal(
            actual, expected, check_index_type=True, check_column_type=True
        )

    # vertical empty
    _assert_same(df.loc[:, []], df.iloc[:, :0])
    # horizontal empty
    _assert_same(df.loc[[], :], df.iloc[:0, :])
    # horizontal empty, rows-only indexer
    _assert_same(df.loc[[]], df.iloc[:0, :])
def test_excel_multindex_roundtrip(
    self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels, request
):
    """Round-trip a 5x5 frame with multi-level indexes through Excel.

    The frame is written and re-read three times: as generated, after the
    first row is set to NaN, and after the last row is set to NaN as well.
    An ``xfail`` marker is added for a named single-level column index,
    except when the row index has three unnamed levels.
    """
    # see gh-4679
    with tm.ensure_clean(ext) as pth:
        named_flat_columns = c_idx_levels == 1 and c_idx_names
        unnamed_three_level_rows = r_idx_levels == 3 and not r_idx_names
        if named_flat_columns and not unnamed_three_level_rows:
            mark = pytest.mark.xfail(
                reason="Column index name cannot be serialized unless "
                "it's a MultiIndex"
            )
            request.node.add_marker(mark)

        # Empty name case current read in as
        # unnamed levels, not Nones.
        check_names = r_idx_names or r_idx_levels <= 1

        df = tm.makeCustomDataframe(
            5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels
        )

        def _assert_roundtrip():
            # Write the current state of ``df`` and compare what is read back.
            df.to_excel(pth)
            act = pd.read_excel(
                pth,
                index_col=list(range(r_idx_levels)),
                header=list(range(c_idx_levels)),
            )
            tm.assert_frame_equal(df, act, check_names=check_names)

        _assert_roundtrip()

        df.iloc[0, :] = np.nan
        _assert_roundtrip()

        df.iloc[-1, :] = np.nan
        _assert_roundtrip()
def test_slice_locs_with_type_mismatch(self):
    """``slice_locs`` raises ``TypeError`` for keys of the wrong level type.

    Exercised on the stacked index of a time-series frame and on the stacked
    index of a custom frame.
    """
    mismatch = "^Level type mismatch"

    df = tm.makeTimeDataFrame()
    idx = df.stack().index
    with pytest.raises(TypeError, match=mismatch):
        idx.slice_locs((1, 3))
    with pytest.raises(TypeError, match=mismatch):
        idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2))

    df = tm.makeCustomDataframe(5, 5)
    idx = df.stack().index
    with pytest.raises(TypeError, match=mismatch):
        idx.slice_locs(timedelta(seconds=30))
    # TODO: Try creating a UnicodeDecodeError in exception message
    with pytest.raises(TypeError, match=mismatch):
        idx.slice_locs(df.index[1], (16, "a"))
def test_to_csv_new_dupe_cols(self):
    """Chunked ``to_csv`` keeps duplicate-labelled columns intact.

    A frame labelled ``["a", "a", "b"]`` is written once in full and once
    with the selection ``["b", "a"]``, using a chunk size smaller than the
    row count, and compared with what ``read_csv`` returns.
    """
    import pandas as pd

    chunksize = 5
    N = int(chunksize * 2.5)

    def _check_df(df, cols=None):
        with tm.ensure_clean() as path:
            df.to_csv(path, columns=cols, chunksize=chunksize)
            rs_c = pd.read_csv(path, index_col=0)

            if cols is None:
                # wrote in the same order
                rs_c.columns = df.columns
                tm.assert_frame_equal(df, rs_c, check_names=False)
                return

            # we wrote them in a different order
            # so compare them in that order
            if df.columns.is_unique:
                rs_c.columns = cols
            else:
                indexer = df.columns.get_indexer_non_unique(cols)[0]
                rs_c.columns = df.columns.take(indexer)

            for label in cols:
                written = df[label]
                read_back = rs_c[label]
                # A unique label selects a Series, a duplicated one a frame.
                if isinstance(written, Series):
                    tm.assert_series_equal(written, read_back)
                else:
                    tm.assert_frame_equal(written, read_back, check_names=False)

    # dupe cols
    df = tm.makeCustomDataframe(N, 3)
    df.columns = ["a", "a", "b"]
    _check_df(df, None)

    # dupe cols with selection
    _check_df(df, ["b", "a"])
def test_header_multi_index(all_parsers):
    """Parse a CSV with four header rows and two index columns.

    The text lists four column-level rows (``C0``..``C3``), one row naming
    the index levels (``R0``, ``R1``) and five data rows. Reading it with
    ``header=[0, 1, 2, 3]`` and ``index_col=[0, 1]`` must reproduce the frame
    built by ``tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)``.

    Args:
        all_parsers: Parser object exposing ``read_csv``.
    """
    parser = all_parsers
    expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    # One entry per CSV record. Joining with explicit newlines keeps the
    # record boundaries unambiguous; the parser needs every header row on
    # its own line.
    rows = [
        "C0,,C_l0_g0,C_l0_g1,C_l0_g2",
        "C1,,C_l1_g0,C_l1_g1,C_l1_g2",
        "C2,,C_l2_g0,C_l2_g1,C_l2_g2",
        "C3,,C_l3_g0,C_l3_g1,C_l3_g2",
        "R0,R1,,,",
        "R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2",
        "R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2",
        "R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2",
        "R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2",
        "R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2",
    ]
    data = "\n".join(rows) + "\n"

    result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
    tm.assert_frame_equal(result, expected)
def test_register_writer(self):
    """A registered ``ExcelWriter`` subclass is picked up by ``to_excel``.

    ``DummyClass`` records on class-level flags which hooks were invoked.
    It is selected first through the ``io.excel.xlsx.writer`` option and
    then through an explicit ``engine="dummy"`` argument; each time the save
    and write-cells hooks must have run and ``sheets`` must not have been
    accessed.
    """

    class DummyClass(ExcelWriter):
        # Flags are stored on the class, so they persist across the writer
        # instances created inside ``to_excel``.
        called_save = False
        called_write_cells = False
        called_sheets = False
        _supported_extensions = ("xlsx", "xls")
        _engine = "dummy"

        def book(self):
            pass

        def _save(self):
            type(self).called_save = True

        def _write_cells(self, *args, **kwargs):
            type(self).called_write_cells = True

        @property
        def sheets(self):
            type(self).called_sheets = True

        @classmethod
        def assert_called_and_reset(cls):
            assert cls.called_save
            assert cls.called_write_cells
            assert not cls.called_sheets
            # Only the two flags expected to be set are cleared.
            cls.called_save = False
            cls.called_write_cells = False

    register_writer(DummyClass)

    with option_context("io.excel.xlsx.writer", "dummy"):
        with tm.ensure_clean("something.xlsx") as xlsx_path:
            with ExcelWriter(xlsx_path) as writer:
                assert isinstance(writer, DummyClass)
            df = tm.makeCustomDataframe(1, 1)
            df.to_excel(xlsx_path)
        DummyClass.assert_called_and_reset()

    with tm.ensure_clean("something.xls") as xls_path:
        df.to_excel(xls_path, engine="dummy")
    DummyClass.assert_called_and_reset()
def test_tabulator_dataframe_replace_data(document, comm):
    """Replacing ``Tabulator.value`` rebuilds the model's columns and data.

    The widget starts with a mixed frame; after assigning a 2x2 custom frame
    the model must expose the index field plus two data fields, a matching
    ``configuration`` and a data source holding the new values.
    """
    table = Tabulator(makeMixedDataFrame())
    model = table.get_root(document, comm)

    table.value = makeCustomDataframe(2, 2)

    fields = ['R0', 'C_l0_g0', 'C_l0_g1']
    assert len(model.columns) == 3
    for column, field in zip(model.columns, fields):
        assert column.field == field

    assert model.configuration == {
        'columns': [{'field': field} for field in fields],
        'selectable': True
    }

    expected = {
        'C_l0_g0': np.array(['R0C0', 'R1C0'], dtype=object),
        'C_l0_g1': np.array(['R0C1', 'R1C1'], dtype=object),
        'R0': np.array(['R_l0_g0', 'R_l0_g1'], dtype=object)
    }
    source_data = model.source.data
    for col in source_data:
        np.testing.assert_array_equal(source_data[col], expected[col])
def make_dataframe(
        nrows: int, ncols: int,
        data_gen_f: Optional[Callable[[int, int], Any]] = None) -> DataFrame:
    """Local mapping of `pandas._testing.makeCustomDataframe`.

    Resulting `DataFrame` will have neither a columns name nor an index name.
    Indices will be a zero-based integer list.

    Parameter names and descriptions are based on those found in
    `pandas._testing.py`.
    https://github.com/pandas-dev/pandas/blob/b687cd4d9e520666a956a60849568a98dd00c672/pandas/_testing.py#L1956

    Args:
        nrows (int): Number of rows.
        ncols (int): Number of columns.
        data_gen_f (func): Function f(row,col) that returns a value for
            the given position. When omitted, the single-column path falls
            back to `make_dataframe_value` and the multi-column path to the
            pandas default.

    Returns:
        DataFrame: Generated `DataFrame` object.
    """
    # pandas bug (?) in makeCustomIndex when nentries = 1
    if ncols == 1:
        # Honour a caller-supplied generator here too; this branch used to
        # ignore `data_gen_f` and always call `make_dataframe_value`.
        value_at = make_dataframe_value if data_gen_f is None else data_gen_f
        return DataFrame(
            {"C_l0_g0": [value_at(row, 0) for row in range(nrows)]})

    # Imported only on the path that needs it, so the single-column case
    # does not depend on this private pandas helper being available.
    from pandas._testing import makeCustomDataframe

    return makeCustomDataframe(
        nrows,
        ncols,
        c_idx_names=False,
        r_idx_names=False,
        data_gen_f=data_gen_f,
        r_idx_type="i",
    )
def df(request):
    """Build the frame selected by ``request.param``.

    Recognised values are ``"delims"``, ``"utf8"``, ``"utf16"``,
    ``"string"``, ``"long"``, ``"nonascii"``, ``"colwidth"``, ``"mixed"``,
    ``"float"`` and ``"int"``.

    Raises:
        ValueError: If ``request.param`` is none of the above.
    """
    data_type = request.param

    def _custom(nrows, **extra):
        # Shared shape of every generated frame: three string-labelled
        # columns, integer row labels, no index names.
        return tm.makeCustomDataframe(
            nrows,
            3,
            c_idx_type="s",
            r_idx_type="i",
            c_idx_names=[None],
            r_idx_names=[None],
            **extra,
        )

    if data_type == "delims":
        return pd.DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]})
    if data_type == "utf8":
        return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]})
    if data_type == "utf16":
        return pd.DataFrame(
            {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]}
        )
    if data_type == "string":
        return _custom(5)
    if data_type == "long":
        # One row more than the display limit.
        max_rows = get_option("display.max_rows")
        return _custom(max_rows + 1, data_gen_f=lambda *args: randint(2))
    if data_type == "nonascii":
        return pd.DataFrame({"en": "in English".split(), "es": "en español".split()})
    if data_type == "colwidth":
        # Every cell is one character wider than the display limit.
        _cw = get_option("display.max_colwidth") + 1
        return _custom(5, data_gen_f=lambda *args: "x" * _cw)
    if data_type == "mixed":
        return DataFrame(
            {
                "a": np.arange(1.0, 6.0) + 0.01,
                "b": np.arange(1, 6).astype(np.int64),
                "c": list("abcde"),
            }
        )
    if data_type == "float":
        return _custom(5, data_gen_f=lambda r, c: float(r) + 0.01)
    if data_type == "int":
        return _custom(5, data_gen_f=lambda *args: randint(2))
    raise ValueError
def test_to_csv_nrows(self, nrows):
    """Round-trip a four-column frame with datetime rows and string columns.

    The pair returned by ``self._return_result_expected`` (chunk size 1000)
    must be equal, ignoring index names.
    """
    row_kind, col_kind = "dt", "s"
    frame = tm.makeCustomDataframe(
        nrows, 4, r_idx_type=row_kind, c_idx_type=col_kind
    )
    outcome = self._return_result_expected(frame, 1000, row_kind, col_kind)
    result, expected = outcome
    tm.assert_frame_equal(result, expected, check_names=False)
def test_dups_fancy_indexing(self):
    """List-based selection on frames with duplicate labels.

    Covers GH 3455 (column order when selecting duplicated labels), frames
    whose duplicated columns span several dtypes, GH 3561 (row selection on a
    duplicated index), and the ``KeyError`` raised whenever a requested label
    is missing (GH 5553, GH 4619).
    """
    missing_msg = "with any missing labels"

    # GH 3455
    df = tm.makeCustomDataframe(10, 3)
    df.columns = ["a", "a", "b"]
    result = df[["b", "a"]].columns
    expected = Index(["b", "a", "a"])
    tm.assert_index_equal(result, expected)

    # across dtypes
    values = [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]
    df = DataFrame(values, columns=list("aaaaaaa"))
    # head() and str() are called only to check they do not raise.
    df.head()
    str(df)
    result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
    result.columns = list("aaaaaaa")

    # TODO(wesm): unused?
    df_v = df.iloc[:, 4]  # noqa
    res_v = result.iloc[:, 4]  # noqa

    tm.assert_frame_equal(df, result)

    # GH 3561, dups not in selected order
    df = DataFrame(
        {
            "test": [5, 7, 9, 11],
            "test1": [4.0, 5, 6, 7],
            "other": list("abcd"),
        },
        index=["A", "A", "B", "C"],
    )
    rows = ["C", "B"]
    expected = DataFrame(
        {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]},
        index=rows,
    )
    tm.assert_frame_equal(df.loc[rows], expected)
    tm.assert_frame_equal(df.loc[Index(rows)], expected)

    rows = ["C", "B", "E"]
    with pytest.raises(KeyError, match=missing_msg):
        df.loc[rows]

    # see GH5553, make sure we use the right indexer
    rows = ["F", "G", "H", "C", "B", "E"]
    with pytest.raises(KeyError, match=missing_msg):
        df.loc[rows]

    # List containing only missing label
    dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
    none_found_msg = re.escape(
        "\"None of [Index(['E'], dtype='object')] are in the [index]\""
    )
    with pytest.raises(KeyError, match=none_found_msg):
        dfnu.loc[["E"]]

    # ToDo: check_index_type can be True after GH 11497

    # GH 4619; duplicate indexer with missing label
    for column_values in ([0, 1, 2], list("abc")):
        df = DataFrame({"A": column_values})
        with pytest.raises(KeyError, match=missing_msg):
            df.loc[[0, 8, 0]]

    # non unique with non unique selector
    df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
    with pytest.raises(KeyError, match=missing_msg):
        df.loc[["A", "A", "E"]]
def test_to_csv_chunksize(self):
    """Round-trip a two-level-index frame slightly over half a chunk long.

    The frame has ``chunksize // 2 + 1`` rows and two columns; the pair
    returned by ``self._return_result_expected`` must be equal, ignoring
    index names.
    """
    chunksize = 1000
    n_rows = chunksize // 2 + 1
    frame = tm.makeCustomDataframe(n_rows, 2, r_idx_nlevels=2)

    outcome = self._return_result_expected(frame, chunksize, rnlvl=2)
    result, expected = outcome
    tm.assert_frame_equal(result, expected, check_names=False)
def test_to_csv_multiindex(self, float_frame, datetime_frame):
    """CSV round trips for frames with MultiIndex rows and/or columns.

    Three temporary-file sections are exercised:

    1. ``float_frame`` and ``datetime_frame`` with a two-level row index
       (with and without reading the index back);
    2. GH3571, GH1651, GH3141: MultiIndex columns, with and without an
       index, plus the errors for too many header rows and for ``columns=``
       on MultiIndex columns;
    3. an empty slice of the datetime frame.

    The fixtures' original indexes are restored after they are replaced.
    """
    tmp_name = "__tmp_to_csv_multiindex__"

    frame = float_frame
    old_index = frame.index
    level_values = np.arange(len(old_index) * 2).reshape(2, -1)
    frame.index = MultiIndex.from_arrays(level_values, names=["first", "second"])

    with tm.ensure_clean(tmp_name) as path:
        frame.to_csv(path, header=False)
        frame.to_csv(path, columns=["A", "B"])

        # round trip
        frame.to_csv(path)
        df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

        # TODO to_csv drops column name
        tm.assert_frame_equal(frame, df, check_names=False)
        assert frame.index.names == df.index.names

        # needed if setUp becomes a class method
        float_frame.index = old_index

        # try multiindex with dates
        tsframe = datetime_frame
        old_index = tsframe.index
        date_levels = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(date_levels)

        tsframe.to_csv(path, index_label=["time", "foo"])
        recons = self.read_csv(path, index_col=[0, 1])

        # TODO to_csv drops column name
        tm.assert_frame_equal(tsframe, recons, check_names=False)

        # do not load index: both index levels come back as data columns
        tsframe.to_csv(path)
        recons = self.read_csv(path, index_col=None)
        assert len(recons.columns) == len(tsframe.columns) + 2

        # no index
        tsframe.to_csv(path, index=False)
        recons = self.read_csv(path, index_col=None)
        tm.assert_almost_equal(recons.values, datetime_frame.values)

        # needed if setUp becomes class method
        datetime_frame.index = old_index

    with tm.ensure_clean(tmp_name) as path:
        # GH3571, GH1651, GH3141

        def _make_frame(names=None):
            # ``True`` is shorthand for the default pair of level names.
            if names is True:
                names = ["first", "second"]
            columns = MultiIndex.from_tuples(
                [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names
            )
            return DataFrame(
                np.random.randint(0, 10, size=(3, 3)),
                columns=columns,
                dtype="int64",
            )

        # Four column levels combined with, in turn:
        #   two row levels   (column & index are multi-index)
        #   one row level    (column is mi)
        #   three row levels (dup column names?)
        for row_levels, index_col in ((2, [0, 1]), (1, 0), (3, [0, 1, 2])):
            df = tm.makeCustomDataframe(
                5, 3, r_idx_nlevels=row_levels, c_idx_nlevels=4
            )
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=index_col)
            tm.assert_frame_equal(df, result)

        # writing with no index
        df = _make_frame()
        df.to_csv(path, index=False)
        result = read_csv(path, header=[0, 1])
        tm.assert_frame_equal(df, result)

        # we lose the names here
        df = _make_frame(True)
        df.to_csv(path, index=False)
        result = read_csv(path, header=[0, 1])
        assert com.all_none(*result.columns.names)
        result.columns.names = df.columns.names
        tm.assert_frame_equal(df, result)

        # whatsnew example, without and then with column level names
        for names in (None, True):
            df = _make_frame(names)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

        # invalid options
        df = _make_frame(True)
        df.to_csv(path)

        for i in [6, 7]:
            msg = f"len of {i}, but only 5 lines in file"
            with pytest.raises(ParserError, match=msg):
                read_csv(path, header=list(range(i)), index_col=0)

        # write with cols
        msg = "cannot specify cols with a MultiIndex"
        with pytest.raises(TypeError, match=msg):
            df.to_csv(path, columns=["foo", "bar"])

    with tm.ensure_clean(tmp_name) as path:
        # empty
        tsframe[:0].to_csv(path)
        recons = self.read_csv(path)

        exp = tsframe[:0]
        exp.index = []

        tm.assert_index_equal(recons.columns, exp.columns)
        assert len(recons) == 0
def test_to_csv_moar(self):
    """Chunked CSV round trips over many shapes and index types.

    Frames are written with ``chunksize=1000`` and read back through
    ``self.read_csv``. Row counts are picked around ``N``, ``2 * N`` and the
    per-column chunk boundary ``chunksize // ncols``. Index type codes,
    duplicate labels and multi-level row/column indexes are all covered.
    """
    N = 100
    chunksize = 1000

    type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}

    def _to_uni(x):
        # Decode anything that is not already text.
        if isinstance(x, str):
            return x
        return x.decode("utf8")

    def _coerce_labels(kind, read_labels, written_labels):
        # Return (read, written) label arrays converted to a common,
        # comparable representation for index type code ``kind``.
        if kind == "u":
            return (
                np.array([_to_uni(label) for label in read_labels], dtype="O"),
                np.array([_to_uni(label) for label in written_labels], dtype="O"),
            )
        if kind == "dt":
            return (
                np.array([Timestamp(label) for label in read_labels], dtype="O"),
                np.array([Timestamp(label) for label in written_labels], dtype="O"),
            )
        if kind == "p":
            # Periods are compared as timestamps on both sides.
            read_stamps = to_datetime(read_labels)
            return (
                np.array([Timestamp(label) for label in read_stamps], dtype="O"),
                np.array(
                    [Timestamp(label) for label in written_labels.to_timestamp()],
                    dtype="O",
                ),
            )
        target = type_map.get(kind)
        return (
            np.array(read_labels, dtype=target),
            np.array(written_labels, dtype=target),
        )

    def _do_test(
        df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
    ):
        kwargs = {"parse_dates": False}
        if cnlvl:
            if rnlvl is not None:
                kwargs["index_col"] = list(range(rnlvl))
            kwargs["header"] = list(range(cnlvl))
        else:
            kwargs["header"] = 0

        with tm.ensure_clean("__tmp_to_csv_moar__") as path:
            df.to_csv(path, encoding="utf8", chunksize=chunksize)
            recons = self.read_csv(path, **kwargs)

        if dupe_col:
            # read_Csv disambiguates the columns by
            # labeling them dupe.1,dupe.2, etc'. monkey patch columns
            recons.columns = df.columns

        if rnlvl and not cnlvl:
            # The extra row levels were read as leading data columns; move
            # them back into a MultiIndex.
            delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
            recons.index = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
            recons = recons.iloc[:, rnlvl - 1 :]

        if r_dtype:
            recons.index, df.index = _coerce_labels(r_dtype, recons.index, df.index)
        if c_dtype:
            recons.columns, df.columns = _coerce_labels(
                c_dtype, recons.columns, df.columns
            )

        tm.assert_frame_equal(df, recons, check_names=False)

    def _row_counts(leading, base):
        # Row counts around N, around 2 * N and around ``base``.
        around_n = [N - 1, N, N + 1, N + 2]
        around_2n = [2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2]
        return leading + around_n + around_2n + [base - 1, base, base + 1]

    ncols = 4
    for nrows in _row_counts([2, 10], chunksize // ncols):
        _do_test(
            tm.makeCustomDataframe(nrows, ncols, r_idx_type="dt", c_idx_type="s"),
            "dt",
            "s",
        )

    for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]:
        for ncols in [1, 2, 3, 4]:
            for nrows in _row_counts([2, 10], chunksize // ncols):
                _do_test(
                    tm.makeCustomDataframe(
                        nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
                    ),
                    r_idx_type,
                    c_idx_type,
                )

    for ncols in [1, 2, 3, 4]:
        for nrows in _row_counts([10, N - 2], chunksize // ncols):
            _do_test(tm.makeCustomDataframe(nrows, ncols))

    for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
        df = tm.makeCustomDataframe(nrows, 3)

        cols = list(df.columns)
        cols[:2] = ["dupe", "dupe"]
        cols[-2:] = ["dupe", "dupe"]

        ix = list(df.index)
        ix[:2] = ["rdupe", "rdupe"]
        ix[-2:] = ["rdupe", "rdupe"]

        df.index = ix
        df.columns = cols
        _do_test(df, dupe_col=True)

    _do_test(DataFrame(index=np.arange(10)))
    _do_test(
        tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2
    )

    for ncols in [2, 3, 4]:
        for nrows in _row_counts([10, N - 2], int(chunksize // ncols)):
            _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
            _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
            _do_test(
                tm.makeCustomDataframe(
                    nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2
                ),
                rnlvl=2,
                cnlvl=2,
            )
def test_to_csv_params(self, nrows, df_params, func_params, ncols):
    """Round-trip a frame built and checked with caller-supplied keywords.

    ``df_params`` is forwarded to ``tm.makeCustomDataframe`` and
    ``func_params`` to ``self._return_result_expected`` (chunk size 1000);
    the returned pair must be equal, ignoring index names.
    """
    frame = tm.makeCustomDataframe(nrows, ncols, **df_params)
    outcome = self._return_result_expected(frame, 1000, **func_params)
    result, expected = outcome
    tm.assert_frame_equal(result, expected, check_names=False)
def test_select_dtypes_typecodes(self):
    """GH 11990: ``select_dtypes`` accepts NumPy float type codes.

    Selecting every code in ``np.typecodes["AllFloat"]`` from an all-float
    frame must return the whole frame.
    """
    # GH 11990
    frame = tm.makeCustomDataframe(
        30, 3, data_gen_f=lambda x, y: np.random.random()
    )
    float_codes = list(np.typecodes["AllFloat"])

    selected = frame.select_dtypes(float_codes)
    tm.assert_frame_equal(selected, frame)