def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] class DummyClass(ExcelWriter): called_save = False called_write_cells = False supported_extensions = ['xlsx', 'xls'] engine = 'dummy' def save(self): called_save.append(True) def write_cells(self, *args, **kwargs): called_write_cells.append(True) def check_called(func): func() assert len(called_save) >= 1 assert len(called_write_cells) >= 1 del called_save[:] del called_write_cells[:] with pd.option_context('io.excel.xlsx.writer', 'dummy'): register_writer(DummyClass) writer = ExcelWriter('something.xlsx') assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) check_called(lambda: df.to_excel('something.xlsx')) check_called( lambda: df.to_excel( 'something.xls', engine='dummy'))
def test_join_with_period_index(self, join_type): df = tm.makeCustomDataframe( 10, 10, data_gen_f=lambda *args: np.random.randint(2), c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] msg = 'can only call with other PeriodIndex-ed objects' with pytest.raises(ValueError, match=msg): df.columns.join(s.index, how=join_type)
def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: randn(), r_idx_type="i", c_idx_type="td") str(df) cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) self.assertEqual(cols.dtype, np.dtype("O")) self.assertEqual(cols.dtype, joined.dtype) tm.assert_index_equal(cols, joined)
def test_join_with_period_index(self, join_type): df = tm.makeCustomDataframe( 10, 10, data_gen_f=lambda *args: np.random.randint(2), c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] expected = df.columns.astype('O').join(s.index, how=join_type) result = df.columns.join(s.index, how=join_type) tm.assert_index_equal(expected, result)
def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: randn(), r_idx_type='i', c_idx_type='dt') cols = df.columns.join(df.index, how='outer') joined = cols.join(df.columns) assert cols.dtype == np.dtype('O') assert cols.dtype == joined.dtype tm.assert_numpy_array_equal(cols.values, joined.values)
def test_join_does_not_recur(self): df = tm.makeCustomDataframe( 3, 2, data_gen_f=lambda *args: np.random.randint(2), c_idx_type='p', r_idx_type='dt') s = df.iloc[:2, 0] res = s.index.join(df.columns, how='outer') expected = Index([s.index[0], s.index[1], df.columns[0], df.columns[1]], object) tm.assert_index_equal(res, expected)
def test_join_with_period_index(self, join_type): df = tm.makeCustomDataframe( 10, 10, data_gen_f=lambda *args: np.random.randint(2), c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] with tm.assert_raises_regex(ValueError, 'can only call with other ' 'PeriodIndex-ed objects'): df.columns.join(s.index, how=join_type)
def test_join_with_period_index(self): df = tm.makeCustomDataframe( 10, 10, data_gen_f=lambda *args: np.random.randint(2), c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] joins = 'left', 'right', 'inner', 'outer' for join in joins: with tm.assertRaisesRegexp(ValueError, 'can only call with other ' 'PeriodIndex-ed objects'): df.columns.join(s.index, how=join)
def test_header_multi_index(self): expected = tm.makeCustomDataframe( 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 C1,,C_l1_g0,C_l1_g1,C_l1_g2 C2,,C_l2_g0,C_l2_g1,C_l2_g2 C3,,C_l3_g0,C_l3_g1,C_l3_g2 R0,R1,,, R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ 0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # skipping lines in the header df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ 0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # INVALID OPTIONS # no as_recarray with tm.assert_produces_warning( FutureWarning, check_stacklevel=False): pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], as_recarray=True, tupleize_cols=False) # names pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], names=['foo', 'bar'], tupleize_cols=False) # usecols pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], usecols=['foo', 'bar'], tupleize_cols=False) # non-numeric index_col pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=['foo', 'bar'], tupleize_cols=False)
def test_slice_locs_with_type_mismatch(): df = tm.makeTimeDataFrame() stacked = df.stack() idx = stacked.index with pytest.raises(TypeError, match='^Level type mismatch'): idx.slice_locs((1, 3)) with pytest.raises(TypeError, match='^Level type mismatch'): idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) df = tm.makeCustomDataframe(5, 5) stacked = df.stack() idx = stacked.index with pytest.raises(TypeError, match='^Level type mismatch'): idx.slice_locs(timedelta(seconds=30)) # TODO: Try creating a UnicodeDecodeError in exception message with pytest.raises(TypeError, match='^Level type mismatch'): idx.slice_locs(df.index[1], (16, "a"))
def test_header_multi_index(self): expected = tm.makeCustomDataframe( 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 C1,,C_l1_g0,C_l1_g1,C_l1_g2 C2,,C_l2_g0,C_l2_g1,C_l2_g2 C3,,C_l3_g0,C_l3_g1,C_l3_g2 R0,R1,,, R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(df, expected) # skipping lines in the header df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(df, expected) # INVALID OPTIONS # names pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], names=['foo', 'bar']) # usecols pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], usecols=['foo', 'bar']) # non-numeric index_col pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=['foo', 'bar'])
def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 C1,,C_l1_g0,C_l1_g1,C_l1_g2 C2,,C_l2_g0,C_l2_g1,C_l2_g2 C3,,C_l3_g0,C_l3_g1,C_l3_g2 R0,R1,,, R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(result, expected)
def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe( 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 C1,,C_l1_g0,C_l1_g1,C_l1_g2 C2,,C_l2_g0,C_l2_g1,C_l2_g2 C3,,C_l3_g0,C_l3_g1,C_l3_g2 R0,R1,,, R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(result, expected)
def test_iloc_empty_list_indexer_is_ok(self): df = tm.makeCustomDataframe(5, 2) # vertical empty tm.assert_frame_equal( df.iloc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True, ) # horizontal empty tm.assert_frame_equal( df.iloc[[], :], df.iloc[:0, :], check_index_type=True, check_column_type=True, ) # horizontal empty tm.assert_frame_equal( df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True )
def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] class DummyClass(ExcelWriter): called_save = False called_write_cells = False supported_extensions = ['test', 'xlsx', 'xls'] engine = 'dummy' def save(self): called_save.append(True) def write_cells(self, *args, **kwargs): called_write_cells.append(True) def check_called(func): func() self.assert_(len(called_save) >= 1) self.assert_(len(called_write_cells) >= 1) del called_save[:] del called_write_cells[:] register_writer(DummyClass) writer = ExcelWriter('something.test') tm.assert_isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) panel = tm.makePanel() func = lambda: df.to_excel('something.test') check_called(func) check_called(lambda: panel.to_excel('something.test')) from pandas import set_option, get_option val = get_option('io.excel.xlsx.writer') set_option('io.excel.xlsx.writer', 'dummy') check_called(lambda: df.to_excel('something.xlsx')) check_called(lambda: df.to_excel('something.xls', engine='dummy')) set_option('io.excel.xlsx.writer', val)
def test_to_csv_multiindex(self, float_frame, datetime_frame): frame = float_frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: frame.to_csv(path, header=False) frame.to_csv(path, columns=["A", "B"]) # round trip frame.to_csv(path) df = self.read_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name tm.assert_frame_equal(frame, df, check_names=False) assert frame.index.names == df.index.names # needed if setUp becomes a class method float_frame.index = old_index # try multiindex with dates tsframe = datetime_frame old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=["time", "foo"]) recons = self.read_csv(path, index_col=[0, 1]) # TODO to_csv drops column name tm.assert_frame_equal(tsframe, recons, check_names=False) # do not load index tsframe.to_csv(path) recons = self.read_csv(path, index_col=None) assert len(recons.columns) == len(tsframe.columns) + 2 # no index tsframe.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) tm.assert_almost_equal(recons.values, datetime_frame.values) # needed if setUp becomes class method datetime_frame.index = old_index with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: names = ["first", "second"] return DataFrame( np.random.randint(0, 10, size=(3, 3)), columns=MultiIndex.from_tuples([("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names), dtype="int64", ) # column & index are multi-index df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(df, result) # column is mi df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=0) tm.assert_frame_equal(df, result) # dup column names? df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) tm.assert_frame_equal(df, result) # writing with no index df = _make_frame() df.to_csv(path, index=False) result = read_csv(path, header=[0, 1]) tm.assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) df.to_csv(path, index=False) result = read_csv(path, header=[0, 1]) assert com.all_none(*result.columns.names) result.columns.names = df.columns.names tm.assert_frame_equal(df, result) # whatsnew example df = _make_frame() df.to_csv(path) result = read_csv(path, header=[0, 1], index_col=[0]) tm.assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path) result = read_csv(path, header=[0, 1], index_col=[0]) tm.assert_frame_equal(df, result) # invalid options df = _make_frame(True) df.to_csv(path) for i in [6, 7]: msg = "len of {i}, but only 5 lines in file".format(i=i) with pytest.raises(ParserError, match=msg): read_csv(path, header=list(range(i)), index_col=0) # write with cols msg = "cannot specify cols with a MultiIndex" with pytest.raises(TypeError, match=msg): df.to_csv(path, columns=["foo", "bar"]) with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: # empty tsframe[:0].to_csv(path) recons = self.read_csv(path) exp = tsframe[:0] exp.index = [] tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0
def df(request): data_type = request.param if data_type == "delims": return pd.DataFrame({ "a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"] }) elif data_type == "utf8": return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) elif data_type == "utf16": return pd.DataFrame({ "a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"] }) elif data_type == "string": return tm.makeCustomDataframe(5, 3, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None]) elif data_type == "long": max_rows = get_option("display.max_rows") return tm.makeCustomDataframe( max_rows + 1, 3, data_gen_f=lambda *args: randint(2), c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) elif data_type == "nonascii": return pd.DataFrame({ "en": "in English".split(), "es": "en español".split() }) elif data_type == "colwidth": _cw = get_option("display.max_colwidth") + 1 return tm.makeCustomDataframe( 5, 3, data_gen_f=lambda *args: "x" * _cw, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) elif data_type == "mixed": return DataFrame({ "a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde") }) elif data_type == "float": return tm.makeCustomDataframe( 5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) elif data_type == "int": return tm.makeCustomDataframe( 5, 3, data_gen_f=lambda *args: randint(2), c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) else: raise ValueError
def test_header_multi_index(self): expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 C1,,C_l1_g0,C_l1_g1,C_l1_g2 C2,,C_l2_g0,C_l2_g1,C_l2_g2 C3,,C_l3_g0,C_l3_g1,C_l3_g2 R0,R1,,, R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # skipping lines in the header df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # INVALID OPTIONS # no as_recarray with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], as_recarray=True, tupleize_cols=False) # names pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], names=['foo', 'bar'], tupleize_cols=False) # usecols pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], usecols=['foo', 'bar'], tupleize_cols=False) # non-numeric index_col pytest.raises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], index_col=['foo', 'bar'], tupleize_cols=False)
def test_dups_fancy_indexing(self): # GH 3455 df = tm.makeCustomDataframe(10, 3) df.columns = ["a", "a", "b"] result = df[["b", "a"]].columns expected = Index(["b", "a", "a"]) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) df.head() str(df) result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) result.columns = list("aaaaaaa") # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( { "test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd") }, index=["A", "A", "B", "C"], ) rows = ["C", "B"] expected = DataFrame( { "test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ["C", "B", "E"] expected = DataFrame( { "test": [11, 9, np.nan], "test1": [7.0, 6, np.nan], "other": ["d", "c", np.nan], }, index=rows, ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ["F", "G", "H", "C", "B", "E"] expected = DataFrame( { "test": [np.nan, np.nan, np.nan, 11, 9, np.nan], "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan], "other": [np.nan, np.nan, np.nan, "d", "c", np.nan], }, index=rows, ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) with pytest.raises( KeyError, match=re.escape( "\"None of [Index(['E'], dtype='object')] are in the [index]\"" ), ): dfnu.loc[["E"]] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list("abc")}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) expected = DataFrame({"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[["A", "A", "E"]] tm.assert_frame_equal(result, expected)
pd.DataFrame({}), pd.DataFrame({"x": [1, 2, 3]}), pd.DataFrame({"x": [1.0, 2.0, 3.0]}), pd.DataFrame({0: [1, 2, 3]}), pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]}), pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=pd.Index([4, 5, 6], name="bar")), pd.Series([1.0, 2.0, 3.0]), pd.Series([1.0, 2.0, 3.0], name="foo"), pd.Series([1.0, 2.0, 3.0], name="foo", index=[4, 5, 6]), pd.Series([1.0, 2.0, 3.0], name="foo", index=pd.Index([4, 5, 6], name="bar")), pd.DataFrame({"x": ["a", "b", "c"]}), pd.DataFrame({"x": [b"a", b"b", b"c"]}), pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=True)}), pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=False)}), tm.makeCategoricalIndex(), tm.makeCustomDataframe(5, 3), tm.makeDataFrame(), tm.makeDateIndex(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeObjectSeries(), tm.makePeriodFrame(), tm.makeRangeIndex(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeUnicodeIndex(), ] @pytest.mark.parametrize("df", dfs) def test_dumps_serialize_numpy(df):
def test_select_dtypes_typecodes(self): # GH 11990 df = tm.makeCustomDataframe(30, 3, data_gen_f=lambda x, y: np.random.random()) expected = df FLOAT_TYPES = list(np.typecodes["AllFloat"]) tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
pd.DataFrame({'x': [1., 2., 3.]}), pd.DataFrame({0: [1, 2, 3]}), pd.DataFrame({'x': [1., 2., 3.], 'y': [4., 5., 6.]}), pd.DataFrame({'x': [1., 2., 3.]}, index=pd.Index([4, 5, 6], name='bar')), pd.Series([1., 2., 3.]), pd.Series([1., 2., 3.], name='foo'), pd.Series([1., 2., 3.], name='foo', index=[4, 5, 6]), pd.Series([1., 2., 3.], name='foo', index=pd.Index([4, 5, 6], name='bar')), pd.DataFrame({'x': ['a', 'b', 'c']}), pd.DataFrame({'x': [b'a', b'b', b'c']}), pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=True)}), pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=False)}), tm.makeCategoricalIndex(), tm.makeCustomDataframe(5, 3), tm.makeDataFrame(), tm.makeDateIndex(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeObjectSeries(), tm.makePeriodFrame(), tm.makeRangeIndex(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeUnicodeIndex(), ] @pytest.mark.parametrize('df', dfs) def test_dumps_serialize_numpy(df):
def test_to_csv_moar(self): def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs["index_col"] = list(range(rnlvl)) kwargs["header"] = list(range(cnlvl)) with tm.ensure_clean("__tmp_to_csv_moar__") as path: df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) else: kwargs["header"] = 0 with tm.ensure_clean("__tmp_to_csv_moar__") as path: df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, str): return x.decode("utf8") return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [ recons.iloc[:, i].values for i in range(rnlvl - 1) ] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O") if r_dtype: if r_dtype == "u": # unicode r_dtype = "O" recons.index = np.array( [_to_uni(label) for label in recons.index], dtype=r_dtype) df.index = np.array([_to_uni(label) for label in df.index], dtype=r_dtype) elif r_dtype == "dt": # unicode r_dtype = "O" recons.index = np.array( [Timestamp(label) for label in recons.index], dtype=r_dtype) df.index = np.array( [Timestamp(label) for label in df.index], dtype=r_dtype) elif r_dtype == "p": r_dtype = "O" idx_list = to_datetime(recons.index) recons.index = np.array( [Timestamp(label) for label in idx_list], dtype=r_dtype) df.index = np.array(list( map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == "u": c_dtype = "O" recons.columns = np.array( [_to_uni(label) for label in recons.columns], dtype=c_dtype) df.columns = np.array( [_to_uni(label) for label in df.columns], dtype=c_dtype) elif c_dtype == "dt": c_dtype = "O" recons.columns = np.array( [Timestamp(label) for label in recons.columns], dtype=c_dtype) df.columns = np.array( [Timestamp(label) for label in df.columns], dtype=c_dtype) elif c_dtype == "p": c_dtype = "O" col_list = to_datetime(recons.columns) recons.columns = np.array( [Timestamp(label) for label in col_list], dtype=c_dtype) col_list = df.columns.to_timestamp() df.columns = np.array( [Timestamp(label) for label in col_list], dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) tm.assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test( tm.makeCustomDataframe(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s", ) for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test( tm.makeCustomDataframe(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s", ) pass for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test( tm.makeCustomDataframe(nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type), r_idx_type, c_idx_type, ) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test(tm.makeCustomDataframe(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: df = tm.makeCustomDataframe(nrows, 3) cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] ix = list(df.index) ix[:2] = ["rdupe", "rdupe"] ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols _do_test(df, dupe_col=True) _do_test(DataFrame(index=np.arange(10))) _do_test(tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) for nrows in [ 10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2) _do_test( tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), rnlvl=2, cnlvl=2, )
def mkdf(rows, cols, colnames, **kwargs): df = pdtest.makeCustomDataframe(rows, cols, **kwargs) df.columns = colnames.split() return df