def setUpClass(cls): super(TestClipboard, cls).setUpClass() cls.data = {} cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['float'] = mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde')}) # Test columns exceeding "max_colwidth" (GH8305) _cw = get_option('display.max_colwidth') + 1 cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) # Test GH-5346 max_rows = get_option('display.max_rows') cls.data['longdf'] = mkdf(max_rows + 1, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) # Test for non-ascii text: GH9263 cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), 'es': 'en español'.split()}) cls.data_types = list(cls.data.keys())
def setup_class(cls): cls.data = {} cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['float'] = mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde')}) # Test columns exceeding "max_colwidth" (GH8305) _cw = get_option('display.max_colwidth') + 1 cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) # Test GH-5346 max_rows = get_option('display.max_rows') cls.data['longdf'] = mkdf(max_rows + 1, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) # Test for non-ascii text: GH9263 cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), 'es': 'en español'.split()}) # unicode round trip test for GH 13747, GH 12529 cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], 'b': ['øπ∆˚¬', 'œ∑´®']}) cls.data_types = list(cls.data.keys())
def test_excel_010_hemstring(self): try: import xlwt import openpyxl except ImportError: raise nose.SkipTest from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 # override of #2370 until sorted out in 0.11 def roundtrip(df, header=True, parser_hdr=0): path = "__tmp__test_xl_010_%s__.xls" % np.random.randint(1, 10000) df.to_excel(path, header=header) xf = pd.ExcelFile(path) try: res = xf.parse(xf.sheet_names[0], header=parser_hdr) return res finally: os.remove(path) nrows = 5 ncols = 3 for i in range(1, 4): # row multindex upto nlevel=3 for j in range(1, 4): # col "" df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) res = roundtrip(df) # shape self.assertEqual(res.shape, (nrows, ncols + i)) # no nans for r in range(len(res.index)): for c in range(len(res.columns)): self.assertTrue(res.ix[r, c] is not np.nan) for i in range(1, 4): # row multindex upto nlevel=3 for j in range(1, 4): # col "" df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) res = roundtrip(df, False) # shape self.assertEqual(res.shape, (nrows - 1, ncols + i)) # first row taken as columns # no nans for r in range(len(res.index)): for c in range(len(res.columns)): self.assertTrue(res.ix[r, c] is not np.nan) res = roundtrip(DataFrame([0])) self.assertEqual(res.shape, (1, 1)) self.assertTrue(res.ix[0, 0] is not np.nan) res = roundtrip(DataFrame([0]), False, None) self.assertEqual(res.shape, (1, 2)) self.assertTrue(res.ix[0, 0] is not np.nan)
def test_excel_010_hemstring(self): _skip_if_no_xlrd() if self.merge_cells: raise nose.SkipTest('Skip tests for merged MI format.') from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 # override of #2370 until sorted out in 0.11 def roundtrip(df, header=True, parser_hdr=0): with ensure_clean(self.ext) as path: df.to_excel(path, header=header, merge_cells=self.merge_cells) xf = pd.ExcelFile(path) res = xf.parse(xf.sheet_names[0], header=parser_hdr) return res nrows = 5 ncols = 3 for i in range(1, 4): # row multindex upto nlevel=3 for j in range(1, 4): # col "" df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) res = roundtrip(df) # shape self.assertEqual(res.shape, (nrows, ncols + i)) # no nans for r in range(len(res.index)): for c in range(len(res.columns)): self.assertTrue(res.ix[r, c] is not np.nan) for i in range(1, 4): # row multindex upto nlevel=3 for j in range(1, 4): # col "" df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) res = roundtrip(df, False) # shape self.assertEqual(res.shape, ( nrows - 1, ncols + i)) # first row taken as columns # no nans for r in range(len(res.index)): for c in range(len(res.columns)): self.assertTrue(res.ix[r, c] is not np.nan) res = roundtrip(DataFrame([0])) self.assertEqual(res.shape, (1, 1)) self.assertTrue(res.ix[0, 0] is not np.nan) res = roundtrip(DataFrame([0]), False, None) self.assertEqual(res.shape, (1, 2)) self.assertTrue(res.ix[0, 0] is not np.nan)
def setUpClass(cls): cls.data = {} cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['float'] = mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde')}) cls.data_types = list(cls.data.keys())
def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels): # see gh-4679 with ensure_clean(ext) as pth: if c_idx_levels == 1 and c_idx_names: pytest.skip("Column index name cannot be " "serialized unless it's a MultiIndex") # Empty name case current read in as # unnamed levels, not Nones. check_names = r_idx_names or r_idx_levels <= 1 df = mkdf(5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels) df.to_excel(pth) act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) tm.assert_frame_equal(df, act, check_names=check_names)
def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, r_idx_names=False).applymap('{0:.3f}'.format).astype(float) out = df.to_html() res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] tm.assert_frame_equal(res, df)
def test_to_csv_legacy_raises_on_dupe_cols(self): df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] with ensure_clean() as path: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.assertRaises(NotImplementedError, df.to_csv, path, engine='python')
def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, r_idx_names=False).applymap( "{0:.3f}".format ) out = df.to_html() res = self.run_read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] print df.dtypes print res.dtypes assert_frame_equal(res, df)
def df(request): data_type = request.param if data_type == 'delims': return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], 'b': ['hi\'j', 'k\'\'lm']}) elif data_type == 'utf8': return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], 'b': ['øπ∆˚¬', 'œ∑´®']}) elif data_type == 'utf16': return pd.DataFrame({'a': ['\U0001f44d\U0001f44d', '\U0001f44d\U0001f44d'], 'b': ['abc', 'def']}) elif data_type == 'string': return mkdf(5, 3, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'long': max_rows = get_option('display.max_rows') return mkdf(max_rows + 1, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'nonascii': return pd.DataFrame({'en': 'in English'.split(), 'es': 'en español'.split()}) elif data_type == 'colwidth': _cw = get_option('display.max_colwidth') + 1 return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'mixed': return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde')}) elif data_type == 'float': return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'int': return mkdf(5, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) else: raise ValueError
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df= mkdf(10, 3) df.columns = ['a','a','b'] cols = ['b','a'] result = df[['b','a']].columns expected = Index(['b','a','a']) self.assert_(result.equals(expected))
def setUpClass(cls): cls.data = {} cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['float'] = mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde')}) # Test GH-5346 max_rows = get_option('display.max_rows') cls.data['longdf'] = mkdf(max_rows+1, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) cls.data_types = list(cls.data.keys())
def test_loc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True) # horizontal empty tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :], check_index_type=True, check_column_type=True) # horizontal empty tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True)
def test_to_csv_dups_cols(self): df = DataFrame(np.random.randn(1000, 30), columns=lrange( 15) + lrange(15), dtype='float64') with ensure_clean() as filename: df.to_csv(filename) # single dtype, fine result = read_csv(filename, index_col=0) result.columns = df.columns assert_frame_equal(result, df) df_float = DataFrame(np.random.randn(1000, 3), dtype='float64') df_int = DataFrame(np.random.randn(1000, 3), dtype='int64') df_bool = DataFrame(True, index=df_float.index, columns=lrange(3)) df_object = DataFrame('foo', index=df_float.index, columns=lrange(3)) df_dt = DataFrame(Timestamp('20010101'), index=df_float.index, columns=lrange(3)) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True) cols = [] for i in range(5): cols.extend([0, 1, 2]) df.columns = cols from pandas import to_datetime with ensure_clean() as filename: df.to_csv(filename) result = read_csv(filename, index_col=0) # date cols for i in ['0.4', '1.4', '2.4']: result[i] = to_datetime(result[i]) result.columns = df.columns assert_frame_equal(result, df) # GH3457 from pandas.util.testing import makeCustomDataframe as mkdf N = 10 df = mkdf(N, 3) df.columns = ['a', 'a', 'b'] with ensure_clean() as filename: df.to_csv(filename) # read_csv will rename the dups columns result = read_csv(filename, index_col=0) result = result.rename(columns={'a.1': 'a'}) assert_frame_equal(result, df)
def test_concat_invalid_first_argument(self): df1 = mkdf(10, 2) df2 = mkdf(10, 2) self.assertRaises(TypeError, concat, df1, df2) # generator ok though concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) # text reader ok # GH6583 data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 """ reader = read_csv(StringIO(data), chunksize=1) result = concat(reader, ignore_index=True) expected = read_csv(StringIO(data)) assert_frame_equal(result, expected)
def test_to_csv_cols_reordering(self): # GH3454 import pandas as pd chunksize = 5 N = int(chunksize * 2.5) df = mkdf(N, 3) cs = df.columns cols = [cs[2], cs[0]] with ensure_clean() as path: df.to_csv(path, columns=cols, chunksize=chunksize) rs_c = pd.read_csv(path, index_col=0) assert_frame_equal(df[cols], rs_c, check_names=False)
def test_excel_010_hemstring(self, merge_cells, engine, ext, c_idx_nlevels, r_idx_nlevels, use_headers): def roundtrip(data, header=True, parser_hdr=0, index=True): data.to_excel(self.path, header=header, merge_cells=merge_cells, index=index) xf = ExcelFile(self.path) return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) # Basic test. parser_header = 0 if use_headers else None res = roundtrip(DataFrame([0]), use_headers, parser_header) assert res.shape == (1, 2) assert res.iloc[0, 0] is not np.nan # More complex tests with multi-index. nrows = 5 ncols = 3 from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels) # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. if c_idx_nlevels > 1: with pytest.raises(NotImplementedError): roundtrip(df, use_headers, index=False) else: res = roundtrip(df, use_headers) if use_headers: assert res.shape == (nrows, ncols + r_idx_nlevels) else: # First row taken as columns. assert res.shape == (nrows - 1, ncols + r_idx_nlevels) # No NaNs. for r in range(len(res.index)): for c in range(len(res.columns)): assert res.iloc[r, c] is not np.nan
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ["a", "a", "b"] cols = ["b", "a"] result = df[["b", "a"]].columns expected = Index(["b", "a", "a"]) self.assert_(result.equals(expected)) # across dtypes df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) df.head() str(df) result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) result.columns = list("aaaaaaa") df_v = df.iloc[:, 4] res_v = result.iloc[:, 4] assert_frame_equal(df, result) # GH 3561, dups not in selected order ind = ["A", "A", "B", "C"] df = DataFrame({"test": range(len(ind))}, index=ind) rows = ["C", "B"] res = df.ix[rows] self.assert_(rows == list(res.index)) res = df.ix[Index(rows)] self.assert_(Index(rows).equals(res.index)) rows = ["C", "B", "E"] res = df.ix[rows] self.assert_(rows == list(res.index)) # inconcistent returns for unique/duplicate indices when values are missing df = DataFrame(randn(4, 3), index=list("ABCD")) expected = df.ix[["E"]] dfnu = DataFrame(randn(5, 3), index=list("AABCD")) result = dfnu.ix[["E"]] assert_frame_equal(result, expected)
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df= mkdf(10, 3) df.columns = ['a','a','b'] cols = ['b','a'] result = df[['b','a']].columns expected = Index(['b','a','a']) self.assert_(result.equals(expected)) # across dtypes df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1,2,1.,2.,3.,'foo','bar']]) result.columns = list('aaaaaaa') df_v = df.iloc[:,4] res_v = result.iloc[:,4] assert_frame_equal(df,result) # GH 3561, dups not in selected order ind = ['A', 'A', 'B', 'C'] df = DataFrame({'test':range(len(ind))}, index=ind) rows = ['C', 'B'] res = df.ix[rows] self.assert_(rows == list(res.index)) res = df.ix[Index(rows)] self.assert_(Index(rows).equals(res.index)) rows = ['C','B','E'] res = df.ix[rows] self.assert_(rows == list(res.index)) # inconcistent returns for unique/duplicate indices when values are missing df = DataFrame(randn(4,3),index=list('ABCD')) expected = df.ix[['E']] dfnu = DataFrame(randn(5,3),index=list('AABCD')) result = dfnu.ix[['E']] assert_frame_equal(result, expected)
def test_to_csv_new_dupe_cols(self): import pandas as pd def _check_df(df, cols=None): with ensure_clean() as path: df.to_csv(path, columns=cols, chunksize=chunksize) rs_c = pd.read_csv(path, index_col=0) # we wrote them in a different order # so compare them in that order if cols is not None: if df.columns.is_unique: rs_c.columns = cols else: indexer, missing = df.columns.get_indexer_non_unique( cols) rs_c.columns = df.columns.take(indexer) for c in cols: obj_df = df[c] obj_rs = rs_c[c] if isinstance(obj_df, Series): assert_series_equal(obj_df, obj_rs) else: assert_frame_equal( obj_df, obj_rs, check_names=False) # wrote in the same order else: rs_c.columns = df.columns assert_frame_equal(df, rs_c, check_names=False) chunksize = 5 N = int(chunksize * 2.5) # dupe cols df = mkdf(N, 3) df.columns = ['a', 'a', 'b'] _check_df(df, None) # dupe cols with selection cols = ['b', 'a'] _check_df(df, cols)
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df= mkdf(10, 3) df.columns = ['a','a','b'] cols = ['b','a'] result = df[['b','a']].columns expected = Index(['b','a','a']) self.assert_(result.equals(expected)) # across dtypes df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1,2,1.,2.,3.,'foo','bar']]) result.columns = list('aaaaaaa') df_v = df.iloc[:,4] res_v = result.iloc[:,4] assert_frame_equal(df,result)
def test_query_multiindex_get_index_resolvers(self): df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) resolvers = df._get_index_resolvers() def to_series(mi, level): level_values = mi.get_level_values(level) s = level_values.to_series() s.index = mi return s col_series = df.columns.to_series() expected = {'index': df.index, 'columns': col_series, 'spam': to_series(df.index, 'spam'), 'eggs': to_series(df.index, 'eggs'), 'C0': col_series} for k, v in resolvers.items(): if isinstance(v, Index): assert v.is_(expected[k]) elif isinstance(v, Series): assert_series_equal(v, expected[k]) else: raise AssertionError("object must be a Series or Index")
def test_to_csv_multiindex(self): frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) frame.index = new_index with ensure_clean('__tmp_to_csv_multiindex__') as path: frame.to_csv(path, header=False) frame.to_csv(path, columns=['A', 'B']) # round trip frame.to_csv(path) df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) self.assertEqual(frame.index.names, df.index.names) # needed if setUP becomes a classmethod self.frame.index = old_index # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=['time', 'foo']) recons = DataFrame.from_csv(path, index_col=[0, 1]) # TODO to_csv drops column name assert_frame_equal(tsframe, recons, check_names=False) # do not load index tsframe.to_csv(path) recons = DataFrame.from_csv(path, index_col=None) self.assertEqual(len(recons.columns), len(tsframe.columns) + 2) # no index tsframe.to_csv(path, index=False) recons = DataFrame.from_csv(path, index_col=None) assert_almost_equal(recons.values, self.tsframe.values) # needed if setUP becomes classmethod self.tsframe.index = old_index with ensure_clean('__tmp_to_csv_multiindex__') as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: names = ['first', 'second'] return DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=MultiIndex.from_tuples( [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')], names=names), dtype='int64') # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=[ 0, 1], tupleize_cols=False) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv( path, header=[0, 1, 2, 3], index_col=0, tupleize_cols=False) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=[ 0, 1, 2], tupleize_cols=False) assert_frame_equal(df, result) # writing with no index df = _make_frame() df.to_csv(path, tupleize_cols=False, index=False) result = read_csv(path, header=[0, 1], tupleize_cols=False) assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) df.to_csv(path, tupleize_cols=False, index=False) result = read_csv(path, header=[0, 1], tupleize_cols=False) self.assertTrue(all([x is None for x in result.columns.names])) result.columns.names = df.columns.names assert_frame_equal(df, result) # tupleize_cols=True and index=False df = _make_frame(True) df.to_csv(path, tupleize_cols=True, index=False) result = read_csv( path, header=0, tupleize_cols=True, index_col=None) result.columns = df.columns assert_frame_equal(df, result) # whatsnew example df = _make_frame() df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1], index_col=[ 0], tupleize_cols=False) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1], index_col=[ 0], tupleize_cols=False) assert_frame_equal(df, result) # column & index are multi-index (compatibility) df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=True) result = read_csv(path, header=0, index_col=[ 0, 1], tupleize_cols=True) result.columns = df.columns assert_frame_equal(df, result) # invalid options df = _make_frame(True) df.to_csv(path, tupleize_cols=False) for i in [6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) with assertRaisesRegexp(CParserError, msg): read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) # write with cols with assertRaisesRegexp(TypeError, 'cannot specify cols with a ' 'MultiIndex'): df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar']) with ensure_clean('__tmp_to_csv_multiindex__') as path: # empty tsframe[:0].to_csv(path) recons = DataFrame.from_csv(path) exp = tsframe[:0] exp.index = [] self.assert_index_equal(recons.columns, exp.columns) self.assertEqual(len(recons), 0)
def test_to_csv_moar(self): def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs['index_col'] = lrange(rnlvl) kwargs['header'] = lrange(cnlvl) with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize, tupleize_cols=False) recons = DataFrame.from_csv( path, tupleize_cols=False, **kwargs) else: kwargs['header'] = 0 with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize) recons = DataFrame.from_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, compat.text_type): return x.decode('utf8') return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [recons.iloc[ :, i].values for i in range(rnlvl - 1)] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') if r_dtype: if r_dtype == 'u': # unicode r_dtype = 'O' recons.index = np.array(lmap(_to_uni, recons.index), dtype=r_dtype) df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype) elif r_dtype == 'dt': # unicode r_dtype = 'O' recons.index = np.array(lmap(Timestamp, recons.index), dtype=r_dtype) df.index = np.array( lmap(Timestamp, df.index), dtype=r_dtype) elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array( list(map(Timestamp, to_datetime(recons.index))), dtype=r_dtype) df.index = np.array( list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == 'u': c_dtype = 'O' recons.columns = np.array(lmap(_to_uni, recons.columns), dtype=c_dtype) df.columns = np.array( lmap(_to_uni, df.columns), dtype=c_dtype) elif c_dtype == 'dt': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, recons.columns), dtype=c_dtype) df.columns = np.array( lmap(Timestamp, df.columns), dtype=c_dtype) elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array( lmap(Timestamp, to_datetime(recons.columns)), dtype=c_dtype) df.columns = np.array( lmap(Timestamp, df.columns.to_timestamp()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), 'dt', 's') for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), 'dt', 's') pass for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'), ('p', 'p')]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type), r_idx_type, c_idx_type) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: df = mkdf(nrows, 3) cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] ix = list(df.index) ix[:2] = ["rdupe", "rdupe"] ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols _do_test(df, dupe_col=True) _do_test(DataFrame(index=lrange(10))) _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2) _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), rnlvl=2, cnlvl=2)
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( { 'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd') }, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( { 'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c'] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( { 'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame( { 'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when values are # missing df = DataFrame(np.random.randn(4, 3), index=list('ABCD')) expected = df.reindex(['E']) dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with catch_warnings(record=True): result = dfnu.ix[['E']] tm.assert_frame_equal(result, expected) # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame({'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected) # GH 5835 # dups on index and missing values df = DataFrame(np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A']) expected = pd.concat([ df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], index=df.index) ], axis=1) result = df.loc[:, ['A', 'B', 'C']] tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing df = DataFrame(np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) expected = df.iloc[0:6] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) expected = df result = df.loc[:, ['a', 'b']] tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] result = df.loc[[1, 2], ['a', 'b']] tm.assert_frame_equal(result, expected)
def test_to_csv_moar(self): def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs['index_col'] = lrange(rnlvl) kwargs['header'] = lrange(cnlvl) with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize, tupleize_cols=False) recons = self.read_csv(path, tupleize_cols=False, **kwargs) else: kwargs['header'] = 0 with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize) recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, compat.text_type): return x.decode('utf8') return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [ recons.iloc[:, i].values for i in range(rnlvl - 1) ] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') if r_dtype: if r_dtype == 'u': # unicode r_dtype = 'O' recons.index = np.array(lmap(_to_uni, recons.index), dtype=r_dtype) df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype) elif r_dtype == 'dt': # unicode r_dtype = 'O' recons.index = np.array(lmap(Timestamp, recons.index), dtype=r_dtype) df.index = np.array(lmap(Timestamp, df.index), dtype=r_dtype) elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array(list( map(Timestamp, to_datetime(recons.index))), dtype=r_dtype) df.index = np.array(list( map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == 'u': c_dtype = 'O' recons.columns = np.array(lmap(_to_uni, recons.columns), dtype=c_dtype) df.columns = np.array(lmap(_to_uni, df.columns), dtype=c_dtype) elif c_dtype == 'dt': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, recons.columns), dtype=c_dtype) df.columns = np.array(lmap(Timestamp, df.columns), dtype=c_dtype) elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, to_datetime( recons.columns)), dtype=c_dtype) df.columns = np.array(lmap(Timestamp, df.columns.to_timestamp()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1 ]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), 'dt', 's') for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1 ]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), 'dt', 's') pass for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'), ('p', 'p')]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1 ]: _do_test( mkdf(nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type), r_idx_type, c_idx_type) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1 ]: _do_test(mkdf(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: df = mkdf(nrows, 3) cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] ix = list(df.index) ix[:2] = ["rdupe", "rdupe"] ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols _do_test(df, dupe_col=True) _do_test(DataFrame(index=lrange(10))) _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) for nrows in [ 10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1 ]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2) _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), rnlvl=2, cnlvl=2)
def test_select_dtypes_typecodes(self): # GH 11990 df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random()) expected = df FLOAT_TYPES = list(np.typecodes['AllFloat']) assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
def test_to_csv_moar(self): def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs["index_col"] = list(range(rnlvl)) kwargs["header"] = list(range(cnlvl)) with ensure_clean("__tmp_to_csv_moar__") as path: df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) else: kwargs["header"] = 0 with ensure_clean("__tmp_to_csv_moar__") as path: df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, str): return x.decode("utf8") return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [ recons.iloc[:, i].values for i in range(rnlvl - 1) ] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O") if r_dtype: if r_dtype == "u": # unicode r_dtype = "O" recons.index = np.array( [_to_uni(label) for label in recons.index], dtype=r_dtype) df.index = np.array([_to_uni(label) for label in df.index], dtype=r_dtype) elif r_dtype == "dt": # unicode r_dtype = "O" recons.index = np.array( [Timestamp(label) for label in recons.index], dtype=r_dtype) df.index = np.array( [Timestamp(label) for label in df.index], dtype=r_dtype) elif r_dtype == "p": r_dtype = "O" idx_list = to_datetime(recons.index) recons.index = np.array( [Timestamp(label) for label in idx_list], dtype=r_dtype) df.index = np.array(list( map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == "u": c_dtype = "O" recons.columns = np.array( [_to_uni(label) for label in recons.columns], dtype=c_dtype) df.columns = np.array( [_to_uni(label) for label in df.columns], dtype=c_dtype) elif c_dtype == "dt": c_dtype = "O" recons.columns = np.array( [Timestamp(label) for label in recons.columns], dtype=c_dtype) df.columns = np.array( [Timestamp(label) for label in df.columns], dtype=c_dtype) elif c_dtype == "p": c_dtype = "O" col_list = to_datetime(recons.columns) recons.columns = np.array( [Timestamp(label) for label in col_list], dtype=c_dtype) col_list = df.columns.to_timestamp() df.columns = np.array( [Timestamp(label) for label in col_list], dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s") for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s") pass for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test( mkdf(nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type), r_idx_type, c_idx_type, ) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [ 10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test(mkdf(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: df = mkdf(nrows, 3) cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] ix = list(df.index) ix[:2] = ["rdupe", "rdupe"] ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols _do_test(df, dupe_col=True) _do_test(DataFrame(index=np.arange(10))) _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) for nrows in [ 10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1, ]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2) _do_test( mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), rnlvl=2, cnlvl=2, )
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( { 'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd') }, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( { 'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c'] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( { 'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan] }, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame( { 'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan] }, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with pytest.raises(KeyError): dfnu.loc[['E']] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame({'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected)
def my_repr(df): return UrlDisplay(server.get_view_url("dfView", df), "350px")._repr_html_() # monkey patch pandas to override it's default HTML repr. This could also # be done # upstream as part of pandas itself. # a cleaner way would be to use IPNB type-based display hooking # google "ipython formatters for_type" # or see # http://ipython.org/ipython-doc/stable/api/generated/IPython.core.formatters.html # print fancy link/embedded HTML in qtconsole/ipnb pd.DataFrame._repr_html_ = my_repr # now, displaying dataframes in IPython-notebook will open up # an IFRAME with the grid view df=mkdf(5000,10) # now we can display the dataframe from our python prompt # and view the url or rendered HTML # >>> df # try to modify the datdrame inplace, and refresh the grid # with the bottom left-hand button # df.ix[0,0]="pooh" # when you're done, shutdown the server to release the socket # server.stop()
def df(request): data_type = request.param if data_type == 'delims': return pd.DataFrame({ 'a': ['"a,\t"b|c', 'd\tef´'], 'b': ['hi\'j', 'k\'\'lm'] }) elif data_type == 'utf8': return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], 'b': ['øπ∆˚¬', 'œ∑´®']}) elif data_type == 'string': return mkdf(5, 3, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'long': max_rows = get_option('display.max_rows') return mkdf(max_rows + 1, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'nonascii': return pd.DataFrame({ 'en': 'in English'.split(), 'es': 'en español'.split() }) elif data_type == 'colwidth': _cw = get_option('display.max_colwidth') + 1 return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'mixed': return DataFrame({ 'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde') }) elif data_type == 'float': return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) elif data_type == 'int': return mkdf(5, 3, data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) else: raise ValueError
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ["a", "a", "b"] result = df[["b", "a"]].columns expected = Index(["b", "a", "a"]) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) df.head() str(df) result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) result.columns = list("aaaaaaa") # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")}, index=["A", "A", "B", "C"], ) rows = ["C", "B"] expected = DataFrame( {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows ) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ["C", "B", "E"] expected = DataFrame( { "test": [11, 9, np.nan], "test1": [7.0, 6, np.nan], "other": ["d", "c", np.nan], }, index=rows, ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ["F", "G", "H", "C", "B", "E"] expected = DataFrame( { "test": [np.nan, np.nan, np.nan, 11, 9, np.nan], "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan], "other": [np.nan, np.nan, np.nan, "d", "c", np.nan], }, index=rows, ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) with pytest.raises( KeyError, match=re.escape( "\"None of [Index(['E'], dtype='object')] are in the [index]\"" ), ): dfnu.loc[["E"]] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list("abc")}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) expected = DataFrame( {"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"] ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[["A", "A", "E"]] tm.assert_frame_equal(result, expected)
def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = mkdf(10, 2) for obj in [1, dict(), [1, 2], (1, 2)]: self.assertRaises(TypeError, lambda x: concat([df1, obj]))
def test_to_csv_multiindex(self): pname = '__tmp_to_csv_multiindex__' frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) frame.index = new_index with ensure_clean(pname) as path: frame.to_csv(path, header=False) frame.to_csv(path, columns=['A', 'B']) # round trip frame.to_csv(path) df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) self.assertEqual(frame.index.names, df.index.names) # needed if setUP becomes a classmethod self.frame.index = old_index # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=['time', 'foo']) recons = DataFrame.from_csv(path, index_col=[0, 1]) # TODO to_csv drops column name assert_frame_equal(tsframe, recons, check_names=False) # do not load index tsframe.to_csv(path) recons = DataFrame.from_csv(path, index_col=None) np.testing.assert_equal( len(recons.columns), len(tsframe.columns) + 2) # no index tsframe.to_csv(path, index=False) recons = DataFrame.from_csv(path, index_col=None) assert_almost_equal(recons.values, self.tsframe.values) # needed if setUP becomes classmethod self.tsframe.index = old_index with ensure_clean(pname) as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: names = ['first', 'second'] return DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=MultiIndex.from_tuples( [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')], names=names), dtype='int64') # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=[ 0, 1], tupleize_cols=False) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv( path, header=[0, 1, 2, 3], index_col=0, tupleize_cols=False) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=[ 0, 1, 2], tupleize_cols=False) assert_frame_equal(df, result) # writing with no index df = _make_frame() df.to_csv(path, tupleize_cols=False, index=False) result = read_csv(path, header=[0, 1], tupleize_cols=False) assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) df.to_csv(path, tupleize_cols=False, index=False) result = read_csv(path, header=[0, 1], tupleize_cols=False) self.assertTrue(all([x is None for x in result.columns.names])) result.columns.names = df.columns.names assert_frame_equal(df, result) # tupleize_cols=True and index=False df = _make_frame(True) df.to_csv(path, tupleize_cols=True, index=False) result = read_csv( path, header=0, tupleize_cols=True, index_col=None) result.columns = df.columns assert_frame_equal(df, result) # whatsnew example df = _make_frame() df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1], index_col=[ 0], tupleize_cols=False) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1], index_col=[ 0], tupleize_cols=False) assert_frame_equal(df, result) # column & index are multi-index (compatibility) df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=True) result = read_csv(path, header=0, index_col=[ 0, 1], tupleize_cols=True) result.columns = df.columns assert_frame_equal(df, result) # invalid options df = _make_frame(True) df.to_csv(path, tupleize_cols=False) # catch invalid headers with assertRaisesRegexp(CParserError, 'Passed header=\[0,1,2\] are too many ' 'rows for this multi_index of columns'): read_csv(path, tupleize_cols=False, header=lrange(3), index_col=0) with assertRaisesRegexp(CParserError, 'Passed header=\[0,1,2,3,4,5,6\], len of ' '7, but only 6 lines in file'): read_csv(path, tupleize_cols=False, header=lrange(7), index_col=0) for i in [4, 5, 6]: with tm.assertRaises(CParserError): read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) # write with cols with assertRaisesRegexp(TypeError, 'cannot specify cols with a ' 'MultiIndex'): df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar']) with ensure_clean(pname) as path: # empty tsframe[:0].to_csv(path) recons = DataFrame.from_csv(path) exp = tsframe[:0] exp.index = [] self.assertTrue(recons.columns.equals(exp.columns)) self.assertEqual(len(recons), 0)
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( {'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd')}, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( {'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( {'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with pytest.raises(KeyError): dfnu.loc[['E']] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected)
def test_to_csv_moar(self): path = '__tmp_to_csv_moar__' def _do_test(df, path, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs['index_col'] = lrange(rnlvl) kwargs['header'] = lrange(cnlvl) with ensure_clean(path) as path: df.to_csv(path, encoding='utf8', chunksize=chunksize, tupleize_cols=False) recons = DataFrame.from_csv( path, tupleize_cols=False, **kwargs) else: kwargs['header'] = 0 with ensure_clean(path) as path: df.to_csv(path, encoding='utf8', chunksize=chunksize) recons = DataFrame.from_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, compat.text_type): return x.decode('utf8') return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [recons.iloc[ :, i].values for i in range(rnlvl - 1)] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') if r_dtype: if r_dtype == 'u': # unicode r_dtype = 'O' recons.index = np.array(lmap(_to_uni, recons.index), dtype=r_dtype) df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype) elif r_dtype == 'dt': # unicode r_dtype = 'O' recons.index = np.array(lmap(Timestamp, recons.index), dtype=r_dtype) df.index = np.array( lmap(Timestamp, df.index), dtype=r_dtype) elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array( list(map(Timestamp, recons.index.to_datetime())), dtype=r_dtype) df.index = np.array( list(map(Timestamp, df.index.to_datetime())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == 'u': c_dtype = 'O' recons.columns = np.array(lmap(_to_uni, recons.columns), dtype=c_dtype) df.columns = np.array( lmap(_to_uni, df.columns), dtype=c_dtype) elif c_dtype == 'dt': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, recons.columns), dtype=c_dtype) df.columns = np.array( lmap(Timestamp, df.columns), dtype=c_dtype) elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array( lmap(Timestamp, recons.columns.to_datetime()), dtype=c_dtype) df.columns = np.array( lmap(Timestamp, df.columns.to_datetime()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 # GH3437 from pandas import NaT def make_dtnat_arr(n, nnat=None): if nnat is None: nnat = int(n * 0.1) # 10% s = list(date_range('2000', freq='5min', periods=n)) if nnat: for i in np.random.randint(0, len(s), nnat): s[i] = NaT i = np.random.randint(100) s[-i] = NaT s[i] = NaT return s # N=35000 s1 = make_dtnat_arr(chunksize + 5) s2 = make_dtnat_arr(chunksize + 5, 0) path = '1.csv' # s3=make_dtnjat_arr(chunksize+5,0) with ensure_clean('.csv') as pth: df = DataFrame(dict(a=s1, b=s2)) df.to_csv(pth, chunksize=chunksize) recons = DataFrame.from_csv(pth)._convert(datetime=True, coerce=True) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), path, 'dt', 's') for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), path, 'dt', 's') pass for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'), ('p', 'p')]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type), path, r_idx_type, c_idx_type) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols), path) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: df = mkdf(nrows, 3) cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] ix = list(df.index) ix[:2] = ["rdupe", "rdupe"] ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols _do_test(df, path, dupe_col=True) _do_test(DataFrame(index=lrange(10)), path) _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), path, rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), path, rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), path, cnlvl=2) _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), path, rnlvl=2, cnlvl=2)
def test_to_csv_multiindex(self): frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index with ensure_clean("__tmp_to_csv_multiindex__") as path: frame.to_csv(path, header=False) frame.to_csv(path, columns=["A", "B"]) # round trip frame.to_csv(path) df = self.read_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) assert frame.index.names == df.index.names # needed if setUp becomes a class method self.frame.index = old_index # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=["time", "foo"]) recons = self.read_csv(path, index_col=[0, 1]) # TODO to_csv drops column name assert_frame_equal(tsframe, recons, check_names=False) # do not load index tsframe.to_csv(path) recons = self.read_csv(path, index_col=None) assert len(recons.columns) == len(tsframe.columns) + 2 # no index tsframe.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) assert_almost_equal(recons.values, self.tsframe.values) # needed if setUp becomes class method self.tsframe.index = old_index with ensure_clean("__tmp_to_csv_multiindex__") as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: names = ["first", "second"] return DataFrame( np.random.randint(0, 10, size=(3, 3)), columns=MultiIndex.from_tuples([("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names), dtype="int64", ) # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=0) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) assert_frame_equal(df, result) # writing with no index df = _make_frame() df.to_csv(path, index=False) result = read_csv(path, header=[0, 1]) assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) df.to_csv(path, index=False) result = read_csv(path, header=[0, 1]) assert com._all_none(*result.columns.names) result.columns.names = df.columns.names assert_frame_equal(df, result) # whatsnew example df = _make_frame() df.to_csv(path) result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path) result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) # invalid options df = _make_frame(True) df.to_csv(path) for i in [6, 7]: msg = "len of {i}, but only 5 lines in file".format(i=i) with pytest.raises(ParserError, match=msg): read_csv(path, header=list(range(i)), index_col=0) # write with cols msg = "cannot specify cols with a MultiIndex" with pytest.raises(TypeError, match=msg): df.to_csv(path, columns=["foo", "bar"]) with ensure_clean("__tmp_to_csv_multiindex__") as path: # empty tsframe[:0].to_csv(path) recons = self.read_csv(path) exp = tsframe[:0] exp.index = [] tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( {'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd')}, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( {'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( {'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when values are # missing df = DataFrame(np.random.randn(4, 3), index=list('ABCD')) expected = df.reindex(['E']) dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with catch_warnings(record=True): result = dfnu.ix[['E']] tm.assert_frame_equal(result, expected) # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected) # GH 5835 # dups on index and missing values df = DataFrame( np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A']) expected = pd.concat( [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], index=df.index)], axis=1) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[:, ['A', 'B', 'C']] tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing df = DataFrame(np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) expected = df.iloc[0:6] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) expected = df result = df.loc[:, ['a', 'b']] tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] result = df.loc[[1, 2], ['a', 'b']] tm.assert_frame_equal(result, expected)
def test_to_csv_multiindex(self): frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) frame.index = new_index with ensure_clean('__tmp_to_csv_multiindex__') as path: frame.to_csv(path, header=False) frame.to_csv(path, columns=['A', 'B']) # round trip frame.to_csv(path) df = self.read_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) assert frame.index.names == df.index.names # needed if setUp becomes a class method self.frame.index = old_index # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=['time', 'foo']) recons = self.read_csv(path, index_col=[0, 1]) # TODO to_csv drops column name assert_frame_equal(tsframe, recons, check_names=False) # do not load index tsframe.to_csv(path) recons = self.read_csv(path, index_col=None) assert len(recons.columns) == len(tsframe.columns) + 2 # no index tsframe.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) assert_almost_equal(recons.values, self.tsframe.values) # needed if setUp becomes class method self.tsframe.index = old_index with ensure_clean('__tmp_to_csv_multiindex__') as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: names = ['first', 'second'] return DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=MultiIndex.from_tuples( [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')], names=names), dtype='int64') # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=0, tupleize_cols=False) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2], tupleize_cols=False) assert_frame_equal(df, result) # writing with no index df = _make_frame() df.to_csv(path, tupleize_cols=False, index=False) result = read_csv(path, header=[0, 1], tupleize_cols=False) assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) df.to_csv(path, tupleize_cols=False, index=False) result = read_csv(path, header=[0, 1], tupleize_cols=False) assert _all_none(*result.columns.names) result.columns.names = df.columns.names assert_frame_equal(df, result) # tupleize_cols=True and index=False df = _make_frame(True) df.to_csv(path, tupleize_cols=True, index=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = read_csv(path, header=0, tupleize_cols=True, index_col=None) result.columns = df.columns assert_frame_equal(df, result) # whatsnew example df = _make_frame() df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1], index_col=[0], tupleize_cols=False) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path, tupleize_cols=False) result = read_csv(path, header=[0, 1], index_col=[0], tupleize_cols=False) assert_frame_equal(df, result) # column & index are multi-index (compatibility) df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path, tupleize_cols=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = read_csv(path, header=0, index_col=[0, 1], tupleize_cols=True) result.columns = df.columns assert_frame_equal(df, result) # invalid options df = _make_frame(True) df.to_csv(path, tupleize_cols=False) for i in [6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) with tm.assert_raises_regex(ParserError, msg): read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) # write with cols with tm.assert_raises_regex( TypeError, 'cannot specify cols ' 'with a MultiIndex'): df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar']) with ensure_clean('__tmp_to_csv_multiindex__') as path: # empty tsframe[:0].to_csv(path) recons = self.read_csv(path) exp = tsframe[:0] exp.index = [] tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df= mkdf(10, 3) df.columns = ['a','a','b'] cols = ['b','a'] result = df[['b','a']].columns expected = Index(['b','a','a']) self.assert_(result.equals(expected)) # across dtypes df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1,2,1.,2.,3.,'foo','bar']]) result.columns = list('aaaaaaa') df_v = df.iloc[:,4] res_v = result.iloc[:,4] assert_frame_equal(df,result) # GH 3561, dups not in selected order df = DataFrame({'test': [5,7,9,11]}, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame({'test' : [11,9]},index=rows) result = df.ix[rows] assert_frame_equal(result, expected) result = df.ix[Index(rows)] assert_frame_equal(result, expected) rows = ['C','B','E'] expected = DataFrame({'test' : [11,9,np.nan]},index=rows) result = df.ix[rows] assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when values are missing df = DataFrame(randn(4,3),index=list('ABCD')) expected = df.ix[['E']] dfnu = DataFrame(randn(5,3),index=list('AABCD')) result = dfnu.ix[['E']] assert_frame_equal(result, expected) # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) result = df.ix[[0,8,0]] expected = DataFrame({"A": [0, np.nan, 0]},index=[0,8,0]) assert_frame_equal(result,expected) df = DataFrame({"A": list('abc')}) result = df.ix[[0,8,0]] expected = DataFrame({"A": ['a', np.nan, 'a']},index=[0,8,0]) assert_frame_equal(result,expected) # non unique with non unique selector df = DataFrame({'test': [5,7,9,11]}, index=['A','A','B','C']) expected = DataFrame({'test' : [5,7,5,7,np.nan]},index=['A','A','A','A','E']) result = df.ix[['A','A','E']] assert_frame_equal(result, expected)
def df(request): data_type = request.param if data_type == "delims": return pd.DataFrame({ "a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"] }) elif data_type == "utf8": return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) elif data_type == "utf16": return pd.DataFrame({ "a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"] }) elif data_type == "string": return mkdf(5, 3, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None]) elif data_type == "long": max_rows = get_option("display.max_rows") return mkdf( max_rows + 1, 3, data_gen_f=lambda *args: randint(2), c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) elif data_type == "nonascii": return pd.DataFrame({ "en": "in English".split(), "es": "en español".split() }) elif data_type == "colwidth": _cw = get_option("display.max_colwidth") + 1 return mkdf( 5, 3, data_gen_f=lambda *args: "x" * _cw, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) elif data_type == "mixed": return DataFrame({ "a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde") }) elif data_type == "float": return mkdf( 5, 3, data_gen_f=lambda r, c: float(r) + 0.01, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) elif data_type == "int": return mkdf( 5, 3, data_gen_f=lambda *args: randint(2), c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None], ) else: raise ValueError