Example #1
0
    def setUpClass(cls):
        """Populate the class-level sample frames for TestClipboard.

        ``cls.data`` maps a scenario name to its frame and
        ``cls.data_types`` lists those names.
        """
        super(TestClipboard, cls).setUpClass()

        def _generated(nrows, **extra):
            # Every mkdf-built frame is requested with the same index
            # keyword arguments; only the row count and the cell generator
            # differ between scenarios.
            return mkdf(nrows, 3, c_idx_type='s', r_idx_type='i',
                        c_idx_names=[None], r_idx_names=[None], **extra)

        frames = cls.data = {}
        frames['string'] = _generated(5)
        frames['int'] = _generated(5, data_gen_f=lambda *args: randint(2))
        frames['float'] = _generated(
            5, data_gen_f=lambda r, c: float(r) + 0.01)
        frames['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                                     'b': np.arange(1, 6),
                                     'c': list('abcde')})

        # Test columns exceeding "max_colwidth" (GH8305)
        too_wide = get_option('display.max_colwidth') + 1
        frames['colwidth'] = _generated(
            5, data_gen_f=lambda *args: 'x' * too_wide)

        # Test GH-5346
        row_limit = get_option('display.max_rows')
        frames['longdf'] = _generated(
            row_limit + 1, data_gen_f=lambda *args: randint(2))

        # Test for non-ascii text: GH9263
        frames['nonascii'] = pd.DataFrame({'en': 'in English'.split(),
                                           'es': 'en español'.split()})

        cls.data_types = list(frames.keys())
Example #2
0
    def setup_class(cls):
        """Populate the class-level sample frames used by the tests.

        ``cls.data`` maps a scenario name to its frame and
        ``cls.data_types`` lists those names.
        """
        def _generated(nrows, **extra):
            # Every mkdf-built frame is requested with the same index
            # keyword arguments; only the row count and the cell generator
            # differ between scenarios.
            return mkdf(nrows, 3, c_idx_type='s', r_idx_type='i',
                        c_idx_names=[None], r_idx_names=[None], **extra)

        frames = cls.data = {}
        frames['string'] = _generated(5)
        frames['int'] = _generated(5, data_gen_f=lambda *args: randint(2))
        frames['float'] = _generated(
            5, data_gen_f=lambda r, c: float(r) + 0.01)
        frames['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                                     'b': np.arange(1, 6),
                                     'c': list('abcde')})

        # Test columns exceeding "max_colwidth" (GH8305)
        too_wide = get_option('display.max_colwidth') + 1
        frames['colwidth'] = _generated(
            5, data_gen_f=lambda *args: 'x' * too_wide)

        # Test GH-5346
        row_limit = get_option('display.max_rows')
        frames['longdf'] = _generated(
            row_limit + 1, data_gen_f=lambda *args: randint(2))

        # Test for non-ascii text: GH9263
        frames['nonascii'] = pd.DataFrame({'en': 'in English'.split(),
                                           'es': 'en español'.split()})

        # unicode round trip test for GH 13747, GH 12529
        frames['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
                                       'b': ['øπ∆˚¬', 'œ∑´®']})

        cls.data_types = list(frames.keys())
Example #3
0
    def test_excel_010_hemstring(self):
        """Round-trip frames through an .xls file and check the parsed shape.

        Frames with 1-3 row and column index levels are written with and
        without a header, parsed back, and checked for the expected shape
        and for the absence of the ``np.nan`` object in every cell.  The
        test is skipped when xlwt or openpyxl cannot be imported.
        """
        try:
            # imported only to probe availability; the names are not used
            import xlwt
            import openpyxl
        except ImportError:
            raise nose.SkipTest

        from pandas.util.testing import makeCustomDataframe as mkdf

        # ensure limited functionality in 0.10
        # override of #2370 until sorted out in 0.11

        def roundtrip(df, header=True, parser_hdr=0):
            path = "__tmp__test_xl_010_%s__.xls" % np.random.randint(1, 10000)
            try:
                # writing and opening happen inside the try so that a
                # failure in either still cleans up the file
                df.to_excel(path, header=header)
                xf = pd.ExcelFile(path)
                return xf.parse(xf.sheet_names[0], header=parser_hdr)
            finally:
                # the file may not exist if the write failed early
                if os.path.exists(path):
                    os.remove(path)

        def assert_no_nan(res):
            # NOTE(review): this is an identity check against the np.nan
            # object, not an isnull() test
            for r in range(len(res.index)):
                for c in range(len(res.columns)):
                    self.assertTrue(res.ix[r, c] is not np.nan)

        nrows = 5
        ncols = 3

        def check_levels(header, expected_rows):
            for i in range(1, 4):  # row multindex upto nlevel=3
                for j in range(1, 4):  # col ""
                    df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j)
                    res = roundtrip(df, header)
                    # shape
                    self.assertEqual(res.shape, (expected_rows, ncols + i))
                    assert_no_nan(res)

        check_levels(True, nrows)
        # without a written header the first row is taken as columns
        check_levels(False, nrows - 1)

        res = roundtrip(DataFrame([0]))
        self.assertEqual(res.shape, (1, 1))
        self.assertTrue(res.ix[0, 0] is not np.nan)

        res = roundtrip(DataFrame([0]), False, None)
        self.assertEqual(res.shape, (1, 2))
        self.assertTrue(res.ix[0, 0] is not np.nan)
Example #4
0
    def test_excel_010_hemstring(self):
        """Round-trip frames through Excel and check the parsed shape.

        Skipped when ``self.merge_cells`` is set.  Frames with 1-3 row and
        column index levels are written with and without a header, parsed
        back, and checked for shape and for the absence of the ``np.nan``
        object in every cell.
        """
        _skip_if_no_xlrd()

        if self.merge_cells:
            raise nose.SkipTest('Skip tests for merged MI format.')

        from pandas.util.testing import makeCustomDataframe as mkdf
        # ensure limited functionality in 0.10
        # override of #2370 until sorted out in 0.11

        n_rows, n_cols = 5, 3

        def roundtrip(frame, header=True, parser_hdr=0):
            # write to the path handed out by ensure_clean, then parse the
            # first sheet back
            with ensure_clean(self.ext) as path:
                frame.to_excel(path, header=header,
                               merge_cells=self.merge_cells)
                workbook = pd.ExcelFile(path)
                return workbook.parse(workbook.sheet_names[0],
                                      header=parser_hdr)

        def assert_no_nan(parsed):
            for r in range(len(parsed.index)):
                for c in range(len(parsed.columns)):
                    self.assertTrue(parsed.ix[r, c] is not np.nan)

        def check_levels(write_header, expected_rows):
            for r_levels in range(1, 4):  # row multindex upto nlevel=3
                for c_levels in range(1, 4):  # col ""
                    frame = mkdf(n_rows, n_cols, r_idx_nlevels=r_levels,
                                 c_idx_nlevels=c_levels)
                    parsed = roundtrip(frame, write_header)
                    # shape
                    self.assertEqual(parsed.shape,
                                     (expected_rows, n_cols + r_levels))
                    # no nans
                    assert_no_nan(parsed)

        check_levels(True, n_rows)
        # first row taken as columns
        check_levels(False, n_rows - 1)

        single = roundtrip(DataFrame([0]))
        self.assertEqual(single.shape, (1, 1))
        self.assertTrue(single.ix[0, 0] is not np.nan)

        single = roundtrip(DataFrame([0]), False, None)
        self.assertEqual(single.shape, (1, 2))
        self.assertTrue(single.ix[0, 0] is not np.nan)
Example #5
0
 def setUpClass(cls):
     """Populate the class-level sample frames used by the tests.

     ``cls.data`` maps a scenario name to its frame and
     ``cls.data_types`` lists those names.
     """
     def _generated(**extra):
         # Every mkdf-built frame is 5x3 and is requested with the same
         # index keyword arguments; only the cell generator differs.
         return mkdf(5, 3, c_idx_type='s', r_idx_type='i',
                     c_idx_names=[None], r_idx_names=[None], **extra)

     frames = cls.data = {}
     frames['string'] = _generated()
     frames['int'] = _generated(data_gen_f=lambda *args: randint(2))
     frames['float'] = _generated(data_gen_f=lambda r, c: float(r) + 0.01)
     frames['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                                  'b': np.arange(1, 6),
                                  'c': list('abcde')})
     cls.data_types = list(frames.keys())
Example #6
0
    def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names,
                                       c_idx_levels, r_idx_levels):
        """Write a frame to Excel and read it back, three times over.

        The frame is compared after each read: first as generated, then
        with its first row set to NaN, then with its last row set to NaN
        as well.
        """
        # see gh-4679
        with ensure_clean(ext) as pth:
            if c_idx_levels == 1 and c_idx_names:
                pytest.skip("Column index name cannot be "
                            "serialized unless it's a MultiIndex")

            # Empty name case current read in as
            # unnamed levels, not Nones.
            check_names = r_idx_names or r_idx_levels <= 1

            df = mkdf(5, 5, c_idx_names, r_idx_names,
                      c_idx_levels, r_idx_levels)

            def assert_roundtrip():
                # reads the current state of ``df`` each time it is called
                df.to_excel(pth)
                act = pd.read_excel(pth,
                                    index_col=list(range(r_idx_levels)),
                                    header=list(range(c_idx_levels)))
                tm.assert_frame_equal(df, act, check_names=check_names)

            assert_roundtrip()

            # blank the first row, then additionally the last row
            for row_pos in (0, -1):
                df.iloc[row_pos, :] = np.nan
                assert_roundtrip()
Example #7
0
 def test_to_html_compat(self):
     """Check that ``to_html`` output parsed by ``read_html`` equals the frame."""
     generated = mkdf(4, 3, data_gen_f=lambda *args: rand(),
                      c_idx_names=False, r_idx_names=False)
     # format each cell to three decimals and convert back to float
     df = generated.applymap('{0:.3f}'.format).astype(float)
     html = df.to_html()
     tables = self.read_html(html, attrs={'class': 'dataframe'},
                             index_col=0)
     tm.assert_frame_equal(tables[0], df)
Example #8
0
 def test_to_csv_legacy_raises_on_dupe_cols(self):
     """``to_csv(engine='python')`` must raise on duplicate column labels.

     The call is also expected to emit a FutureWarning.
     """
     frame = mkdf(10, 3)
     frame.columns = ['a', 'a', 'b']
     with ensure_clean() as path:
         expect_warning = tm.assert_produces_warning(FutureWarning,
                                                     check_stacklevel=False)
         with expect_warning:
             with self.assertRaises(NotImplementedError):
                 frame.to_csv(path, engine='python')
Example #9
0
 def test_to_html_compat(self):
     """Check that ``to_html`` output parsed back equals the original frame.

     A 4x3 frame of random values, each formatted to three decimals, is
     rendered with ``to_html`` and parsed with ``self.run_read_html``; the
     first parsed table must equal the frame.
     """
     df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, r_idx_names=False).applymap(
         "{0:.3f}".format
     )
     out = df.to_html()
     res = self.run_read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
     assert_frame_equal(res, df)
Example #10
0
def df(request):
    """Return the sample frame selected by ``request.param``.

    Each known name maps to a zero-argument builder, so only the requested
    frame is constructed.  Raises ``ValueError`` for an unknown name.
    """
    def _generated(nrows, **extra):
        # shared index keyword arguments for every mkdf-built frame
        return mkdf(nrows, 3, c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None], **extra)

    def _long():
        row_limit = get_option('display.max_rows')
        return _generated(row_limit + 1,
                          data_gen_f=lambda *args: randint(2))

    def _colwidth():
        too_wide = get_option('display.max_colwidth') + 1
        return _generated(5, data_gen_f=lambda *args: 'x' * too_wide)

    builders = {
        'delims': lambda: pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'],
                                        'b': ['hi\'j', 'k\'\'lm']}),
        'utf8': lambda: pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
                                      'b': ['øπ∆˚¬', 'œ∑´®']}),
        'utf16': lambda: pd.DataFrame({'a': ['\U0001f44d\U0001f44d',
                                             '\U0001f44d\U0001f44d'],
                                       'b': ['abc', 'def']}),
        'string': lambda: _generated(5),
        'long': _long,
        'nonascii': lambda: pd.DataFrame({'en': 'in English'.split(),
                                          'es': 'en español'.split()}),
        'colwidth': _colwidth,
        'mixed': lambda: DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                                    'b': np.arange(1, 6),
                                    'c': list('abcde')}),
        'float': lambda: _generated(
            5, data_gen_f=lambda r, c: float(r) + 0.01),
        'int': lambda: _generated(5, data_gen_f=lambda *args: randint(2)),
    }

    build = builders.get(request.param)
    if build is None:
        raise ValueError
    return build()
Example #11
0
    def test_dups_fancy_indexing(self):
        """Selecting a list of labels from duplicate columns keeps all dups."""

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        cols = ['b', 'a']
        # selecting 'a' must bring back both columns carrying that label
        result = df[cols].columns
        expected = Index(['b', 'a', 'a'])
        self.assertTrue(result.equals(expected))
Example #12
0
 def setUpClass(cls):
     """Populate the class-level sample frames used by the tests.

     ``cls.data`` maps a scenario name to its frame and
     ``cls.data_types`` lists those names.
     """
     def _generated(nrows, **extra):
         # Every mkdf-built frame is requested with the same index
         # keyword arguments; only the row count and the cell generator
         # differ between scenarios.
         return mkdf(nrows, 3, c_idx_type='s', r_idx_type='i',
                     c_idx_names=[None], r_idx_names=[None], **extra)

     frames = cls.data = {}
     frames['string'] = _generated(5)
     frames['int'] = _generated(5, data_gen_f=lambda *args: randint(2))
     frames['float'] = _generated(
         5, data_gen_f=lambda r, c: float(r) + 0.01)
     frames['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                                  'b': np.arange(1, 6),
                                  'c': list('abcde')})

     # Test GH-5346
     row_limit = get_option('display.max_rows')
     frames['longdf'] = _generated(
         row_limit + 1, data_gen_f=lambda *args: randint(2))

     cls.data_types = list(frames.keys())
Example #13
0
 def test_loc_empty_list_indexer_is_ok(self):
     """``.loc`` with an empty list must match the equivalent empty ``.iloc`` slice."""
     from pandas.util.testing import makeCustomDataframe as mkdf
     frame = mkdf(5, 2)
     # index and column types are compared strictly in every check
     strict = dict(check_index_type=True, check_column_type=True)

     # vertical empty
     no_columns = frame.loc[:, []]
     tm.assert_frame_equal(no_columns, frame.iloc[:, :0], **strict)

     # horizontal empty
     no_rows = frame.loc[[], :]
     tm.assert_frame_equal(no_rows, frame.iloc[:0, :], **strict)

     # horizontal empty
     no_rows = frame.loc[[]]
     tm.assert_frame_equal(no_rows, frame.iloc[:0, :], **strict)
Example #14
0
    def test_to_csv_dups_cols(self):
        """Round-trip frames with duplicate column labels through CSV.

        Covers a single-dtype frame, a mixed-dtype frame and an mkdf frame
        (GH3457); after reading, the column labels are restored before the
        frames are compared.
        """
        dup_labels = lrange(15) + lrange(15)
        df = DataFrame(np.random.randn(1000, 30), columns=dup_labels,
                       dtype='float64')

        with ensure_clean() as filename:
            df.to_csv(filename)  # single dtype, fine
            result = read_csv(filename, index_col=0)
            result.columns = df.columns
            assert_frame_equal(result, df)

        # one three-column frame per dtype, all sharing the float frame's index
        df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
        df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
        pieces = [
            df_float,
            df_int,
            DataFrame(True, index=df_float.index, columns=lrange(3)),
            DataFrame('foo', index=df_float.index, columns=lrange(3)),
            DataFrame(Timestamp('20010101'),
                      index=df_float.index, columns=lrange(3)),
        ]
        df = pd.concat(pieces, axis=1, ignore_index=True)
        df.columns = [0, 1, 2] * 5

        from pandas import to_datetime
        with ensure_clean() as filename:
            df.to_csv(filename)
            result = read_csv(filename, index_col=0)

            # date cols
            for date_col in ['0.4', '1.4', '2.4']:
                result[date_col] = to_datetime(result[date_col])

            result.columns = df.columns
            assert_frame_equal(result, df)

        # GH3457
        from pandas.util.testing import makeCustomDataframe as mkdf

        N = 10
        df = mkdf(N, 3)
        df.columns = ['a', 'a', 'b']

        with ensure_clean() as filename:
            df.to_csv(filename)

            # read_csv will rename the dups columns
            reread = read_csv(filename, index_col=0)
            reread = reread.rename(columns={'a.1': 'a'})
            assert_frame_equal(reread, df)
Example #15
0
    def test_concat_invalid_first_argument(self):
        """``concat`` rejects frames passed positionally but accepts iterables.

        Two frames given as separate arguments must raise ``TypeError``; a
        generator of frames and a chunked CSV reader are both accepted.
        """
        left = mkdf(10, 2)
        right = mkdf(10, 2)
        with self.assertRaises(TypeError):
            concat(left, right)

        # generator ok though
        concat(DataFrame(np.random.rand(5, 5)) for _ in range(3))

        # text reader ok
        # GH6583
        data = ("index,A,B,C,D\n"
                "foo,2,3,4,5\n"
                "bar,7,8,9,10\n"
                "baz,12,13,14,15\n"
                "qux,12,13,14,15\n"
                "foo2,12,13,14,15\n"
                "bar2,12,13,14,15\n")

        chunked = read_csv(StringIO(data), chunksize=1)
        expected = read_csv(StringIO(data))
        result = concat(chunked, ignore_index=True)
        assert_frame_equal(result, expected)
Example #16
0
    def test_to_csv_cols_reordering(self):
        """Writing a reordered subset of columns reads back in that order."""
        # GH3454
        import pandas as pd

        chunksize = 5
        # row count that is not a whole multiple of the chunk size
        N = int(chunksize * 2.5)

        df = mkdf(N, 3)
        labels = df.columns
        cols = [labels[2], labels[0]]

        with ensure_clean() as path:
            df.to_csv(path, columns=cols, chunksize=chunksize)
            reread = pd.read_csv(path, index_col=0)

        expected = df[cols]
        assert_frame_equal(expected, reread, check_names=False)
Example #17
0
    def test_excel_010_hemstring(self, merge_cells, engine, ext,
                                 c_idx_nlevels, r_idx_nlevels, use_headers):
        """Round-trip frames through ``self.path`` and check the parsed shape.

        A one-cell frame is checked first, then an mkdf frame with the
        requested index levels.  Writing a frame with more than one column
        level and ``index=False`` is expected to raise
        ``NotImplementedError``.
        """
        def roundtrip(data, header=True, parser_hdr=0, index=True):
            data.to_excel(self.path, header=header,
                          merge_cells=merge_cells, index=index)

            workbook = ExcelFile(self.path)
            first_sheet = workbook.sheet_names[0]
            return pd.read_excel(workbook, first_sheet, header=parser_hdr)

        # Basic test.
        if use_headers:
            parser_header = 0
        else:
            parser_header = None
        simple = roundtrip(DataFrame([0]), use_headers, parser_header)

        assert simple.shape == (1, 2)
        assert simple.iloc[0, 0] is not np.nan

        # More complex tests with multi-index.
        nrows, ncols = 5, 3

        from pandas.util.testing import makeCustomDataframe as mkdf
        # ensure limited functionality in 0.10
        # override of gh-2370 until sorted out in 0.11

        df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels,
                  c_idx_nlevels=c_idx_nlevels)

        # This if will be removed once multi-column Excel writing
        # is implemented. For now fixing gh-9794.
        if c_idx_nlevels > 1:
            with pytest.raises(NotImplementedError):
                roundtrip(df, use_headers, index=False)
            return

        res = roundtrip(df, use_headers)

        # Without headers the first row is taken as columns.
        expected_rows = nrows if use_headers else nrows - 1
        assert res.shape == (expected_rows, ncols + r_idx_nlevels)

        # No NaNs.
        n_res_rows = len(res.index)
        n_res_cols = len(res.columns)
        for r in range(n_res_rows):
            for c in range(n_res_cols):
                assert res.iloc[r, c] is not np.nan
Example #18
0
    def test_dups_fancy_indexing(self):
        """Exercise list-based selection on frames with duplicate labels.

        Covers duplicate column labels (GH 3455), duplicate index labels
        selected out of order (GH 3561), and selection of a missing label
        from unique versus duplicate indexes.
        """

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf

        df = mkdf(10, 3)
        df.columns = ["a", "a", "b"]
        cols = ["b", "a"]
        # selecting 'a' must bring back both columns carrying that label
        result = df[cols].columns
        expected = Index(["b", "a", "a"])
        self.assertTrue(result.equals(expected))

        # across dtypes
        df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa"))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        result.columns = list("aaaaaaa")

        # positional selection on duplicate labels is only exercised here;
        # the selected columns themselves are not compared
        df.iloc[:, 4]
        result.iloc[:, 4]

        assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        ind = ["A", "A", "B", "C"]
        df = DataFrame({"test": range(len(ind))}, index=ind)
        rows = ["C", "B"]
        res = df.ix[rows]
        self.assertEqual(rows, list(res.index))

        res = df.ix[Index(rows)]
        self.assertTrue(Index(rows).equals(res.index))

        rows = ["C", "B", "E"]
        res = df.ix[rows]
        self.assertEqual(rows, list(res.index))

        # inconsistent returns for unique/duplicate indices when values are missing
        df = DataFrame(randn(4, 3), index=list("ABCD"))
        expected = df.ix[["E"]]

        dfnu = DataFrame(randn(5, 3), index=list("AABCD"))
        result = dfnu.ix[["E"]]
        assert_frame_equal(result, expected)
Example #19
0
    def test_dups_fancy_indexing(self):
        """Exercise list-based selection on frames with duplicate labels.

        Covers duplicate column labels (GH 3455), duplicate index labels
        selected out of order (GH 3561), and selection of a missing label
        from unique versus duplicate indexes.
        """

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        cols = ['b', 'a']
        # selecting 'a' must bring back both columns carrying that label
        result = df[cols].columns
        expected = Index(['b', 'a', 'a'])
        self.assertTrue(result.equals(expected))

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # positional selection on duplicate labels is only exercised here;
        # the selected columns themselves are not compared
        df.iloc[:, 4]
        result.iloc[:, 4]

        assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        ind = ['A', 'A', 'B', 'C']
        df = DataFrame({'test': range(len(ind))}, index=ind)
        rows = ['C', 'B']
        res = df.ix[rows]
        self.assertEqual(rows, list(res.index))

        res = df.ix[Index(rows)]
        self.assertTrue(Index(rows).equals(res.index))

        rows = ['C', 'B', 'E']
        res = df.ix[rows]
        self.assertEqual(rows, list(res.index))

        # inconsistent returns for unique/duplicate indices when values are missing
        df = DataFrame(randn(4, 3), index=list('ABCD'))
        expected = df.ix[['E']]

        dfnu = DataFrame(randn(5, 3), index=list('AABCD'))
        result = dfnu.ix[['E']]
        assert_frame_equal(result, expected)
Example #20
0
    def test_to_csv_new_dupe_cols(self):
        """Round-trip a frame with duplicate column labels through CSV.

        The frame is written once with all columns and once with an
        explicit, reordered column selection.
        """
        import pandas as pd

        chunksize = 5
        # row count that is not a whole multiple of the chunk size
        N = int(chunksize * 2.5)

        def _check_df(df, cols=None):
            with ensure_clean() as path:
                df.to_csv(path, columns=cols, chunksize=chunksize)
                reread = pd.read_csv(path, index_col=0)

                # wrote in the same order
                if cols is None:
                    reread.columns = df.columns
                    assert_frame_equal(df, reread, check_names=False)
                    return

                # we wrote them in a different order
                # so compare them in that order
                if df.columns.is_unique:
                    reread.columns = cols
                else:
                    indexer, _ = df.columns.get_indexer_non_unique(cols)
                    reread.columns = df.columns.take(indexer)

                for label in cols:
                    written = df[label]
                    loaded = reread[label]
                    # a label may select either a Series or a frame
                    if isinstance(written, Series):
                        assert_series_equal(written, loaded)
                    else:
                        assert_frame_equal(written, loaded,
                                           check_names=False)

        # dupe cols
        df = mkdf(N, 3)
        df.columns = ['a', 'a', 'b']
        _check_df(df, None)

        # dupe cols with selection
        cols = ['b', 'a']
        _check_df(df, cols)
Example #21
0
    def test_dups_fancy_indexing(self):
        """Exercise selection and construction with duplicate column labels."""

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        cols = ['b', 'a']
        # selecting 'a' must bring back both columns carrying that label
        result = df[cols].columns
        expected = Index(['b', 'a', 'a'])
        self.assertTrue(result.equals(expected))

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # positional selection on duplicate labels is only exercised here;
        # the selected columns themselves are not compared
        df.iloc[:, 4]
        result.iloc[:, 4]

        assert_frame_equal(df, result)
Example #22
0
    def test_query_multiindex_get_index_resolvers(self):
        """Check each entry returned by ``_get_index_resolvers``.

        Index resolvers must be the same object as the expected index
        (checked with ``is_``); Series resolvers must equal the expected
        series.
        """
        df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs'])
        resolvers = df._get_index_resolvers()

        def level_as_series(mi, level):
            # the values of one level, re-indexed by the full MultiIndex
            series = mi.get_level_values(level).to_series()
            series.index = mi
            return series

        columns_as_series = df.columns.to_series()
        expected = {
            'index': df.index,
            'columns': columns_as_series,
            'spam': level_as_series(df.index, 'spam'),
            'eggs': level_as_series(df.index, 'eggs'),
            'C0': columns_as_series,
        }

        for name, resolved in resolvers.items():
            if isinstance(resolved, Index):
                assert resolved.is_(expected[name])
                continue
            if not isinstance(resolved, Series):
                raise AssertionError("object must be a Series or Index")
            assert_series_equal(resolved, expected[name])
Example #23
0
    def test_to_csv_multiindex(self):
        """Round-trip frames with MultiIndex rows and/or columns through CSV.

        Three sections, each using its own ``ensure_clean`` path:

        1. ``self.frame`` and ``self.tsframe`` are temporarily given a
           MultiIndex, written, and read back with ``DataFrame.from_csv``;
           their original indexes are restored afterwards.
        2. Frames with MultiIndex columns (GH3571, GH1651, GH3141) are
           written with ``tupleize_cols`` True/False and read back with
           ``read_csv``; invalid ``header`` lengths and a ``columns``
           selection are expected to raise.
        3. An empty slice of ``tsframe`` is written and read back.
        """

        frame = self.frame
        old_index = frame.index
        # two rows of consecutive integers, one row per index level
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
        frame.index = new_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:

            # these two writes are not read back; they are only exercised
            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=['A', 'B'])

            # round trip
            frame.to_csv(path)
            df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            assert_frame_equal(frame, df, check_names=False)
            self.assertEqual(frame.index.names, df.index.names)

            # needed if setUp becomes a classmethod
            self.frame.index = old_index

            # try multiindex with dates
            tsframe = self.tsframe
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=['time', 'foo'])
            recons = DataFrame.from_csv(path, index_col=[0, 1])
            # TODO to_csv drops column name
            assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = DataFrame.from_csv(path, index_col=None)
            # the two index levels come back as ordinary columns
            self.assertEqual(len(recons.columns), len(tsframe.columns) + 2)

            # no index
            tsframe.to_csv(path, index=False)
            recons = DataFrame.from_csv(path, index_col=None)
            assert_almost_equal(recons.values, self.tsframe.values)

            # needed if setUp becomes a classmethod
            self.tsframe.index = old_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                # 3x3 random-integer frame with two-level columns;
                # names=True selects the default level names
                if names is True:
                    names = ['first', 'second']
                return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                                 columns=MultiIndex.from_tuples(
                                     [('bah', 'foo'),
                                      ('bah', 'bar'),
                                      ('ban', 'baz')], names=names),
                                 dtype='int64')

            # column & index are multi-index
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[
                              0, 1], tupleize_cols=False)
            assert_frame_equal(df, result)

            # column is mi
            df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(
                path, header=[0, 1, 2, 3], index_col=0, tupleize_cols=False)
            assert_frame_equal(df, result)

            # dup column names?
            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[
                              0, 1, 2], tupleize_cols=False)
            assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, tupleize_cols=False, index=False)
            result = read_csv(path, header=[0, 1], tupleize_cols=False)
            assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False, index=False)
            result = read_csv(path, header=[0, 1], tupleize_cols=False)
            self.assertTrue(all([x is None for x in result.columns.names]))
            # restore the names so the frames can be compared
            result.columns.names = df.columns.names
            assert_frame_equal(df, result)

            # tupleize_cols=True and index=False
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=True, index=False)
            result = read_csv(
                path, header=0, tupleize_cols=True, index_col=None)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1], index_col=[
                              0], tupleize_cols=False)
            assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1], index_col=[
                              0], tupleize_cols=False)
            assert_frame_equal(df, result)

            # column & index are multi-index (compatibility)
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=True)
            result = read_csv(path, header=0, index_col=[
                              0, 1], tupleize_cols=True)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False)

            # asking for more header rows than the file has lines must fail
            for i in [6, 7]:
                msg = 'len of {i}, but only 5 lines in file'.format(i=i)
                with assertRaisesRegexp(CParserError, msg):
                    read_csv(path, tupleize_cols=False,
                             header=lrange(i), index_col=0)

            # write with cols
            with assertRaisesRegexp(TypeError, 'cannot specify cols with a '
                                    'MultiIndex'):
                df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar'])

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # empty
            # tsframe is the local bound in the first section above
            tsframe[:0].to_csv(path)
            recons = DataFrame.from_csv(path)
            exp = tsframe[:0]
            exp.index = []

            self.assert_index_equal(recons.columns, exp.columns)
            self.assertEqual(len(recons), 0)
Example #24
0
    def test_to_csv_moar(self):
        """Round-trip frames of many sizes and index types through CSV.

        Every frame is written with ``to_csv`` and read back with
        ``DataFrame.from_csv``; row counts are picked around ``N``,
        ``2 * N`` and ``chunksize // ncols``.
        """

        def _do_test(df, r_dtype=None, c_dtype=None,
                     rnlvl=None, cnlvl=None, dupe_col=False):
            """Write ``df`` to a temporary CSV, read it back and compare.

            r_dtype / c_dtype : type code of the row / column labels; when
                given, the labels of both frames are coerced to a common
                ndarray before the comparison.
            rnlvl / cnlvl : number of row / column index levels.
            dupe_col : if True, the original column labels are put back on
                the reconstructed frame before the comparison.
            """
            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)
                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8',
                              chunksize=chunksize, tupleize_cols=False)
                    recons = DataFrame.from_csv(
                        path, tupleize_cols=False, **kwargs)
            else:
                kwargs['header'] = 0
                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = DataFrame.from_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x

            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')

            def _coerce(labels, kind, original):
                """Return ``labels`` as an ndarray comparable across frames.

                ``original`` is True for labels taken from ``df`` (the 'p'
                branch converts them with ``to_timestamp``) and False for
                labels read back from the CSV (converted with
                ``to_datetime``).
                """
                if kind == 'u':  # unicode
                    return np.array(lmap(_to_uni, labels), dtype='O')
                if kind == 'dt':
                    return np.array(lmap(Timestamp, labels), dtype='O')
                if kind == 'p':
                    stamps = (labels.to_timestamp() if original
                              else to_datetime(labels))
                    return np.array(lmap(Timestamp, stamps), dtype='O')
                return np.array(labels, dtype=type_map.get(kind))

            if dupe_col:
                # read_csv disambiguates the columns by labeling them
                # dupe.1, dupe.2, etc. Monkey patch the columns back.
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # the extra index levels came back as leading data columns;
                # fold them into a MultiIndex and drop them from the data
                delta_lvl = [recons.iloc[:, i].values
                             for i in range(rnlvl - 1)]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            if r_dtype:
                recons.index = _coerce(recons.index, r_dtype, False)
                df.index = _coerce(df.index, r_dtype, True)
            if c_dtype:
                recons.columns = _coerce(recons.columns, c_dtype, False)
                df.columns = _coerce(df.columns, c_dtype, True)

            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)

        N = 100
        chunksize = 1000

        for ncols in [4]:
            base = chunksize // ncols or 1
            for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_type='dt',
                              c_idx_type='s'), 'dt', 's')

        for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'),
                                       ('p', 'p')]:
            for ncols in [1, 2, 3, 4]:
                base = chunksize // ncols or 1
                for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                              2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                              base - 1, base, base + 1]:
                    _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type,
                                  c_idx_type=c_idx_type),
                             r_idx_type, c_idx_type)

        for ncols in [1, 2, 3, 4]:
            base = chunksize // ncols or 1
            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols))

        # duplicated labels at both ends of the rows and the columns
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = mkdf(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        _do_test(DataFrame(index=lrange(10)))
        _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2)
        for ncols in [2, 3, 4]:
            base = chunksize // ncols
            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
                _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
                         rnlvl=2, cnlvl=2)
Example #25
0
    def test_dups_fancy_indexing(self):
        """Check list-based selection on frames with duplicate labels."""

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        # smoke-test head and repr with fully duplicated column labels
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # positional selection of one duplicated column must agree between
        # the two construction paths (previously computed but never checked)
        tm.assert_series_equal(df.iloc[:, 4], result.iloc[:, 4])

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(np.random.randn(5, 5),
                       columns=['A', 'B', 'B', 'B', 'A'])

        expected = pd.concat(
            [df.loc[:, ['A', 'B']],
             DataFrame(np.nan, columns=['C'], index=df.index)],
            axis=1)
        result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                       columns=['a', 'b'])

        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)
Example #26
0
    def test_to_csv_moar(self):
        """Round-trip frames of many sizes and index types through CSV.

        Every frame is written with ``to_csv`` and read back with
        ``self.read_csv``; row counts are picked around ``N``, ``2 * N``
        and ``chunksize // ncols``.
        """

        def _do_test(df,
                     r_dtype=None,
                     c_dtype=None,
                     rnlvl=None,
                     cnlvl=None,
                     dupe_col=False):
            """Write ``df`` to a temporary CSV, read it back and compare.

            r_dtype / c_dtype : type code of the row / column labels; when
                given, the labels of both frames are coerced to a common
                ndarray before the comparison.
            rnlvl / cnlvl : number of row / column index levels.
            dupe_col : if True, the original column labels are put back on
                the reconstructed frame before the comparison.
            """
            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)

                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path,
                              encoding='utf8',
                              chunksize=chunksize,
                              tupleize_cols=False)
                    recons = self.read_csv(path, tupleize_cols=False, **kwargs)
            else:
                kwargs['header'] = 0

                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x

            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')

            def _coerce(labels, kind, original):
                """Return ``labels`` as an ndarray comparable across frames.

                ``original`` is True for labels taken from ``df`` (the 'p'
                branch converts them with ``to_timestamp``) and False for
                labels read back from the CSV (converted with
                ``to_datetime``).
                """
                if kind == 'u':  # unicode
                    return np.array(lmap(_to_uni, labels), dtype='O')
                if kind == 'dt':
                    return np.array(lmap(Timestamp, labels), dtype='O')
                if kind == 'p':
                    stamps = (labels.to_timestamp() if original
                              else to_datetime(labels))
                    return np.array(lmap(Timestamp, stamps), dtype='O')
                return np.array(labels, dtype=type_map.get(kind))

            if dupe_col:
                # read_csv disambiguates the columns by labeling them
                # dupe.1, dupe.2, etc. Monkey patch the columns back.
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # the extra index levels came back as leading data columns;
                # fold them into a MultiIndex and drop them from the data
                delta_lvl = [
                    recons.iloc[:, i].values for i in range(rnlvl - 1)
                ]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            if r_dtype:
                recons.index = _coerce(recons.index, r_dtype, False)
                df.index = _coerce(df.index, r_dtype, True)
            if c_dtype:
                recons.columns = _coerce(recons.columns, c_dtype, False)
                df.columns = _coerce(df.columns, c_dtype, True)

            assert_frame_equal(df,
                               recons,
                               check_names=False,
                               check_less_precise=True)

        N = 100
        chunksize = 1000

        for ncols in [4]:
            base = chunksize // ncols or 1
            for nrows in [
                    2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N,
                    2 * N + 1, 2 * N + 2, base - 1, base, base + 1
            ]:
                _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'),
                         'dt', 's')

        for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'),
                                       ('p', 'p')]:
            for ncols in [1, 2, 3, 4]:
                base = chunksize // ncols or 1
                for nrows in [
                        2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1,
                        2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1
                ]:
                    _do_test(
                        mkdf(nrows,
                             ncols,
                             r_idx_type=r_idx_type,
                             c_idx_type=c_idx_type), r_idx_type, c_idx_type)

        for ncols in [1, 2, 3, 4]:
            base = chunksize // ncols or 1
            for nrows in [
                    10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1,
                    2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1
            ]:
                _do_test(mkdf(nrows, ncols))

        # duplicated labels at both ends of the rows and the columns
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = mkdf(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        _do_test(DataFrame(index=lrange(10)))
        _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2)
        for ncols in [2, 3, 4]:
            base = chunksize // ncols
            for nrows in [
                    10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1,
                    2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1
            ]:
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
                _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
                         rnlvl=2,
                         cnlvl=2)
Example #27
0
 def test_select_dtypes_typecodes(self):
     """Selecting by every NumPy float typecode keeps an all-random frame."""
     # GH 11990
     frame = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
     float_codes = list(np.typecodes['AllFloat'])
     selected = frame.select_dtypes(float_codes)
     assert_frame_equal(selected, frame)
Example #28
0
    def test_to_csv_moar(self):
        """Round-trip frames of many sizes and index types through CSV.

        Every frame is written with ``to_csv`` and read back with
        ``self.read_csv``; row counts are picked around ``N``, ``2 * N``
        and ``chunksize // ncols``.
        """

        def _do_test(df,
                     r_dtype=None,
                     c_dtype=None,
                     rnlvl=None,
                     cnlvl=None,
                     dupe_col=False):
            """Write ``df`` to a temporary CSV, read it back and compare.

            r_dtype / c_dtype : type code of the row / column labels; when
                given, the labels of both frames are coerced to a common
                ndarray before the comparison.
            rnlvl / cnlvl : number of row / column index levels.
            dupe_col : if True, the original column labels are put back on
                the reconstructed frame before the comparison.
            """
            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs["index_col"] = list(range(rnlvl))
                kwargs["header"] = list(range(cnlvl))
            else:
                kwargs["header"] = 0

            # only the reader kwargs differ between the two cases above
            with ensure_clean("__tmp_to_csv_moar__") as path:
                df.to_csv(path, encoding="utf8", chunksize=chunksize)
                recons = self.read_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, str):
                    return x.decode("utf8")
                return x

            type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O")

            def _coerce(labels, kind, original):
                """Return ``labels`` as an ndarray comparable across frames.

                ``original`` is True for labels taken from ``df`` (the "p"
                branch converts them with ``to_timestamp``) and False for
                labels read back from the CSV (converted with
                ``to_datetime``).
                """
                if kind == "u":  # unicode
                    return np.array([_to_uni(label) for label in labels],
                                    dtype="O")
                if kind == "dt":
                    return np.array([Timestamp(label) for label in labels],
                                    dtype="O")
                if kind == "p":
                    stamps = (labels.to_timestamp() if original
                              else to_datetime(labels))
                    return np.array([Timestamp(label) for label in stamps],
                                    dtype="O")
                return np.array(labels, dtype=type_map.get(kind))

            if dupe_col:
                # read_csv disambiguates the columns by labeling them
                # dupe.1, dupe.2, etc. Monkey patch the columns back.
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # the extra index levels came back as leading data columns;
                # fold them into a MultiIndex and drop them from the data
                delta_lvl = [
                    recons.iloc[:, i].values for i in range(rnlvl - 1)
                ]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            if r_dtype:
                recons.index = _coerce(recons.index, r_dtype, False)
                df.index = _coerce(df.index, r_dtype, True)
            if c_dtype:
                recons.columns = _coerce(recons.columns, c_dtype, False)
                df.columns = _coerce(df.columns, c_dtype, True)

            assert_frame_equal(df,
                               recons,
                               check_names=False,
                               check_less_precise=True)

        N = 100
        chunksize = 1000

        for ncols in [4]:
            base = chunksize // ncols or 1
            for nrows in [
                    2,
                    10,
                    N - 1,
                    N,
                    N + 1,
                    N + 2,
                    2 * N - 2,
                    2 * N - 1,
                    2 * N,
                    2 * N + 1,
                    2 * N + 2,
                    base - 1,
                    base,
                    base + 1,
            ]:
                _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"),
                         "dt", "s")

        for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"),
                                       ("p", "p")]:
            for ncols in [1, 2, 3, 4]:
                base = chunksize // ncols or 1
                for nrows in [
                        2,
                        10,
                        N - 1,
                        N,
                        N + 1,
                        N + 2,
                        2 * N - 2,
                        2 * N - 1,
                        2 * N,
                        2 * N + 1,
                        2 * N + 2,
                        base - 1,
                        base,
                        base + 1,
                ]:
                    _do_test(
                        mkdf(nrows,
                             ncols,
                             r_idx_type=r_idx_type,
                             c_idx_type=c_idx_type),
                        r_idx_type,
                        c_idx_type,
                    )

        for ncols in [1, 2, 3, 4]:
            base = chunksize // ncols or 1
            for nrows in [
                    10,
                    N - 2,
                    N - 1,
                    N,
                    N + 1,
                    N + 2,
                    2 * N - 2,
                    2 * N - 1,
                    2 * N,
                    2 * N + 1,
                    2 * N + 2,
                    base - 1,
                    base,
                    base + 1,
            ]:
                _do_test(mkdf(nrows, ncols))

        # duplicated labels at both ends of the rows and the columns
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = mkdf(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        _do_test(DataFrame(index=np.arange(10)))
        _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2)
        for ncols in [2, 3, 4]:
            base = chunksize // ncols
            for nrows in [
                    10,
                    N - 2,
                    N - 1,
                    N,
                    N + 1,
                    N + 2,
                    2 * N - 2,
                    2 * N - 1,
                    2 * N,
                    2 * N + 1,
                    2 * N + 2,
                    base - 1,
                    base,
                    base + 1,
            ]:
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
                _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
                _do_test(
                    mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
                    rnlvl=2,
                    cnlvl=2,
                )
Example #29
0
    def test_dups_fancy_indexing(self):
        """Check list-based selection on frames with duplicate labels."""

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        # smoke-test head and repr with fully duplicated column labels
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # positional selection of one duplicated column must agree between
        # the two construction paths (previously computed but never checked)
        tm.assert_series_equal(df.iloc[:, 4], result.iloc[:, 4])

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        # a list with a missing label is expected to emit a FutureWarning
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with pytest.raises(KeyError):
            dfnu.loc[['E']]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)
Example #30
0
def my_repr(df):
    """Return the HTML representation of a ``UrlDisplay`` for *df*.

    The view URL is requested from ``server`` under the ``"dfView"`` name and
    handed to ``UrlDisplay`` together with the ``"350px"`` argument.
    """
    view_url = server.get_view_url("dfView", df)
    display = UrlDisplay(view_url, "350px")
    return display._repr_html_()

# Monkey-patch pandas to override its default HTML repr. This could also
# be done upstream as part of pandas itself.
# A cleaner way would be to use IPython notebook type-based display hooking:
# search for "ipython formatters for_type", or see
# http://ipython.org/ipython-doc/stable/api/generated/IPython.core.formatters.html

# Print a fancy link / embedded HTML in qtconsole and the IPython notebook.
pd.DataFrame._repr_html_ = my_repr
# From now on, displaying dataframes in the IPython notebook will open up
# an IFRAME with the grid view.

df=mkdf(5000,10)

# Now we can display the dataframe from our Python prompt
# and view the URL or the rendered HTML:
# >>> df

# Try to modify the dataframe in place, and refresh the grid
# with the bottom left-hand button:
# df.ix[0,0]="pooh"


# When you're done, shut down the server to release the socket:
# server.stop()


Example #31
0
def df(request):
    """Return the sample DataFrame named by ``request.param``.

    Supported names are ``'delims'``, ``'utf8'``, ``'string'``, ``'long'``,
    ``'nonascii'``, ``'colwidth'``, ``'mixed'``, ``'float'`` and ``'int'``.

    Raises
    ------
    ValueError
        If ``request.param`` is none of the names above.
    """
    kind = request.param

    # Index settings shared by every ``mkdf`` call below.
    idx_opts = dict(c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])

    if kind == 'delims':
        columns = {
            'a': ['"a,\t"b|c', 'd\tef´'],
            'b': ['hi\'j', 'k\'\'lm'],
        }
        return pd.DataFrame(columns)

    if kind == 'utf8':
        columns = {'a': ['µasd', 'Ωœ∑´'], 'b': ['øπ∆˚¬', 'œ∑´®']}
        return pd.DataFrame(columns)

    if kind == 'string':
        return mkdf(5, 3, **idx_opts)

    if kind == 'long':
        # One row more than the ``display.max_rows`` option.
        n_rows = get_option('display.max_rows') + 1
        return mkdf(n_rows, 3,
                    data_gen_f=lambda *args: randint(2),
                    **idx_opts)

    if kind == 'nonascii':
        columns = {
            'en': 'in English'.split(),
            'es': 'en español'.split(),
        }
        return pd.DataFrame(columns)

    if kind == 'colwidth':
        # Cell text one character longer than ``display.max_colwidth``.
        width = get_option('display.max_colwidth') + 1
        return mkdf(5, 3,
                    data_gen_f=lambda *args: 'x' * width,
                    **idx_opts)

    if kind == 'mixed':
        columns = {
            'a': np.arange(1.0, 6.0) + 0.01,
            'b': np.arange(1, 6),
            'c': list('abcde'),
        }
        return DataFrame(columns)

    if kind == 'float':
        return mkdf(5, 3,
                    data_gen_f=lambda r, c: float(r) + 0.01,
                    **idx_opts)

    if kind == 'int':
        return mkdf(5, 3,
                    data_gen_f=lambda *args: randint(2),
                    **idx_opts)

    raise ValueError
Example #32
0
    def test_dups_fancy_indexing(self):
        """Check selection on frames whose columns or index hold duplicates.

        Covers duplicate column labels (GH 3455), row selection with labels in
        a different order than the index (GH 3561, GH5553), and list indexers
        that contain missing labels (GH 4619).
        """
        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf

        def loc_expecting_warning(frame, labels):
            # Perform ``frame.loc[labels]`` while asserting that the lookup
            # emits a FutureWarning (stack level is not checked).
            with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
                return frame.loc[labels]

        dup_cols = mkdf(10, 3)
        dup_cols.columns = ["a", "a", "b"]
        tm.assert_index_equal(dup_cols[["b", "a"]].columns, Index(["b", "a", "a"]))

        # across dtypes
        left = DataFrame(
            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")
        )
        left.head()
        str(left)
        right = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        right.columns = list("aaaaaaa")

        # TODO(wesm): unused?
        left.iloc[:, 4]
        right.iloc[:, 4]

        tm.assert_frame_equal(left, right)

        # GH 3561, dups not in selected order
        source = DataFrame(
            {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
            index=["A", "A", "B", "C"],
        )
        labels = ["C", "B"]
        wanted = DataFrame(
            {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=labels
        )
        tm.assert_frame_equal(source.loc[labels], wanted)
        tm.assert_frame_equal(source.loc[Index(labels)], wanted)

        labels = ["C", "B", "E"]
        wanted = DataFrame(
            {
                "test": [11, 9, np.nan],
                "test1": [7.0, 6, np.nan],
                "other": ["d", "c", np.nan],
            },
            index=labels,
        )
        tm.assert_frame_equal(loc_expecting_warning(source, labels), wanted)

        # see GH5553, make sure we use the right indexer
        labels = ["F", "G", "H", "C", "B", "E"]
        wanted = DataFrame(
            {
                "test": [np.nan, np.nan, np.nan, 11, 9, np.nan],
                "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan],
                "other": [np.nan, np.nan, np.nan, "d", "c", np.nan],
            },
            index=labels,
        )
        tm.assert_frame_equal(loc_expecting_warning(source, labels), wanted)

        # List containing only missing label
        non_unique = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
        missing_msg = re.escape(
            "\"None of [Index(['E'], dtype='object')] are in the [index]\""
        )
        with pytest.raises(KeyError, match=missing_msg):
            non_unique.loc[["E"]]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        ints = DataFrame({"A": [0, 1, 2]})
        got = loc_expecting_warning(ints, [0, 8, 0])
        wanted = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(got, wanted, check_index_type=False)

        letters = DataFrame({"A": list("abc")})
        got = loc_expecting_warning(letters, [0, 8, 0])
        wanted = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0])
        tm.assert_frame_equal(got, wanted, check_index_type=False)

        # non unique with non unique selector
        dup_index = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
        wanted = DataFrame(
            {"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"]
        )
        got = loc_expecting_warning(dup_index, ["A", "A", "E"])
        tm.assert_frame_equal(got, wanted)
Example #33
0
    def test_concat_invalid(self):
        """``concat`` must raise ``TypeError`` when handed a non-NDFrame."""

        # trying to concat a ndframe with a non-ndframe
        df1 = mkdf(10, 2)
        for obj in [1, dict(), [1, 2], (1, 2)]:
            # Pass ``concat`` and its argument directly. ``assertRaises``
            # invokes the callable with exactly the extra arguments given, so
            # a one-parameter lambda would raise ``TypeError`` for the missing
            # argument and the check would pass without ever running
            # ``concat``.
            self.assertRaises(TypeError, concat, [df1, obj])
Example #34
0
 def test_to_html_compat(self):
     """Round-trip a frame through ``to_html`` and ``self.read_html``."""
     raw = mkdf(4, 3, data_gen_f=lambda *args: rand(),
                c_idx_names=False, r_idx_names=False)
     # Format every cell to three decimals, then convert back to float.
     frame = raw.applymap('{0:.3f}'.format).astype(float)
     markup = frame.to_html()
     tables = self.read_html(markup, attrs={'class': 'dataframe'},
                             index_col=0)
     tm.assert_frame_equal(tables[0], frame)
Example #35
0
    def test_to_csv_multiindex(self):
        """Round-trip MultiIndex frames through ``to_csv`` and back.

        Covers a MultiIndex on the rows of ``self.frame`` and ``self.tsframe``,
        MultiIndex columns with both ``tupleize_cols`` settings (GH3571,
        GH1651, GH3141), parser errors for invalid ``header`` arguments, and
        an empty frame.
        """

        pname = '__tmp_to_csv_multiindex__'
        frame = self.frame
        old_index = frame.index
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
        # ``frame`` is ``self.frame`` itself, so this mutates the fixture; the
        # original index is put back further down.
        frame.index = new_index

        with ensure_clean(pname) as path:

            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=['A', 'B'])

            # round trip
            frame.to_csv(path)
            df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            assert_frame_equal(frame, df, check_names=False)
            self.assertEqual(frame.index.names, df.index.names)

            # needed if setUP becomes a classmethod
            self.frame.index = old_index

            # try multiindex with dates
            tsframe = self.tsframe
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=['time', 'foo'])
            recons = DataFrame.from_csv(path, index_col=[0, 1])
            # TODO to_csv drops column name
            assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = DataFrame.from_csv(path, index_col=None)
            # the two index levels come back as two extra ordinary columns
            np.testing.assert_equal(
                len(recons.columns), len(tsframe.columns) + 2)

            # no index
            tsframe.to_csv(path, index=False)
            recons = DataFrame.from_csv(path, index_col=None)
            assert_almost_equal(recons.values, self.tsframe.values)

            # needed if setUP becomes classmethod
            self.tsframe.index = old_index

        with ensure_clean(pname) as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                # 3x3 int64 frame with two-level columns; ``names=True`` is a
                # shorthand for the level names ['first', 'second'].
                if names is True:
                    names = ['first', 'second']
                return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                                 columns=MultiIndex.from_tuples(
                                     [('bah', 'foo'),
                                      ('bah', 'bar'),
                                      ('ban', 'baz')], names=names),
                                 dtype='int64')

            # column & index are multi-index
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[
                              0, 1], tupleize_cols=False)
            assert_frame_equal(df, result)

            # column is mi
            df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(
                path, header=[0, 1, 2, 3], index_col=0, tupleize_cols=False)
            assert_frame_equal(df, result)

            # dup column names?
            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[
                              0, 1, 2], tupleize_cols=False)
            assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, tupleize_cols=False, index=False)
            result = read_csv(path, header=[0, 1], tupleize_cols=False)
            assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False, index=False)
            result = read_csv(path, header=[0, 1], tupleize_cols=False)
            self.assertTrue(all(x is None for x in result.columns.names))
            result.columns.names = df.columns.names
            assert_frame_equal(df, result)

            # tupleize_cols=True and index=False
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=True, index=False)
            result = read_csv(
                path, header=0, tupleize_cols=True, index_col=None)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1], index_col=[
                              0], tupleize_cols=False)
            assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path, header=[0, 1], index_col=[
                              0], tupleize_cols=False)
            assert_frame_equal(df, result)

            # column & index are multi-index (compatibility)
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=True)
            result = read_csv(path, header=0, index_col=[
                              0, 1], tupleize_cols=True)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False)

            # catch invalid headers
            # The patterns are raw strings so that ``\[`` / ``\]`` reach the
            # regex engine as written instead of being invalid string escapes.
            with assertRaisesRegexp(CParserError,
                                    r'Passed header=\[0,1,2\] are too many '
                                    r'rows for this multi_index of columns'):
                read_csv(path, tupleize_cols=False,
                         header=lrange(3), index_col=0)

            with assertRaisesRegexp(CParserError,
                                    r'Passed header=\[0,1,2,3,4,5,6\], len of '
                                    r'7, but only 6 lines in file'):
                read_csv(path, tupleize_cols=False,
                         header=lrange(7), index_col=0)

            for i in [4, 5, 6]:
                with tm.assertRaises(CParserError):
                    read_csv(path, tupleize_cols=False,
                             header=lrange(i), index_col=0)

            # write with cols
            with assertRaisesRegexp(TypeError, 'cannot specify cols with a '
                                    'MultiIndex'):
                df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar'])

        with ensure_clean(pname) as path:
            # empty
            tsframe[:0].to_csv(path)
            recons = DataFrame.from_csv(path)
            exp = tsframe[:0]
            exp.index = []

            self.assertTrue(recons.columns.equals(exp.columns))
            self.assertEqual(len(recons), 0)
Example #36
0
    def test_dups_fancy_indexing(self):
        """Check indexing on frames with duplicated column or index labels.

        Covers duplicate column labels (GH 3455), row selection with labels in
        a different order than the index (GH 3561, GH5553), and list indexers
        that contain missing labels (GH 4619).
        """
        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf

        def future_warning():
            # Context manager asserting that the wrapped lookup emits a
            # FutureWarning (stack level is not checked).
            return tm.assert_produces_warning(FutureWarning,
                                              check_stacklevel=False)

        frame = mkdf(10, 3)
        frame.columns = ['a', 'a', 'b']
        tm.assert_index_equal(frame[['b', 'a']].columns,
                              Index(['b', 'a', 'a']))

        # across dtypes
        frame = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                          columns=list('aaaaaaa'))
        frame.head()
        str(frame)
        other = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        other.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        frame.iloc[:, 4]
        other.iloc[:, 4]

        tm.assert_frame_equal(frame, other)

        # GH 3561, dups not in selected order
        frame = DataFrame({'test': [5, 7, 9, 11],
                           'test1': [4., 5, 6, 7],
                           'other': list('abcd')},
                          index=['A', 'A', 'B', 'C'])
        labels = ['C', 'B']
        wanted = DataFrame({'test': [11, 9],
                            'test1': [7., 6],
                            'other': ['d', 'c']},
                           index=labels)
        tm.assert_frame_equal(frame.loc[labels], wanted)
        tm.assert_frame_equal(frame.loc[Index(labels)], wanted)

        labels = ['C', 'B', 'E']
        wanted = DataFrame({'test': [11, 9, np.nan],
                            'test1': [7., 6, np.nan],
                            'other': ['d', 'c', np.nan]},
                           index=labels)
        with future_warning():
            got = frame.loc[labels]
        tm.assert_frame_equal(got, wanted)

        # see GH5553, make sure we use the right indexer
        labels = ['F', 'G', 'H', 'C', 'B', 'E']
        wanted = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan],
            },
            index=labels)
        with future_warning():
            got = frame.loc[labels]
        tm.assert_frame_equal(got, wanted)

        # List containing only missing label
        non_unique = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with pytest.raises(KeyError):
            non_unique.loc[['E']]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        frame = DataFrame({"A": [0, 1, 2]})
        with future_warning():
            got = frame.loc[[0, 8, 0]]
        wanted = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(got, wanted, check_index_type=False)

        frame = DataFrame({"A": list('abc')})
        with future_warning():
            got = frame.loc[[0, 8, 0]]
        wanted = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(got, wanted, check_index_type=False)

        # non unique with non unique selector
        frame = DataFrame({'test': [5, 7, 9, 11]},
                          index=['A', 'A', 'B', 'C'])
        wanted = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                           index=['A', 'A', 'A', 'A', 'E'])
        with future_warning():
            got = frame.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(got, wanted)
Example #37
0
    def test_to_csv_moar(self):
        """Round-trip many frame shapes and index types through CSV.

        Frames are written with ``to_csv`` (passing ``chunksize``) and read
        back with ``DataFrame.from_csv``; row counts are chosen around ``N``,
        ``2 * N`` and ``chunksize // ncols``.
        """
        path = '__tmp_to_csv_moar__'

        def _do_test(df, path, r_dtype=None, c_dtype=None,
                     rnlvl=None, cnlvl=None, dupe_col=False):
            # Write ``df`` to a temporary file, read it back and assert that
            # the reconstructed frame equals the original.
            #
            # r_dtype / c_dtype: type code of the row / column labels
            #     ('i', 'f', 's', 'u', 'dt' or 'p'); when given, the labels of
            #     both frames are normalised before comparing.
            # rnlvl / cnlvl: number of row / column index levels.
            # dupe_col: the frame has duplicate column names.
            #
            # ``chunksize`` is read from the enclosing test; it is bound
            # there before the first call to this helper.

            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)
                with ensure_clean(path) as path:
                    df.to_csv(path, encoding='utf8',
                              chunksize=chunksize, tupleize_cols=False)
                    recons = DataFrame.from_csv(
                        path, tupleize_cols=False, **kwargs)
            else:
                kwargs['header'] = 0
                with ensure_clean(path) as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = DataFrame.from_csv(path, **kwargs)

            def _to_uni(x):
                # Decode anything that is not already text as UTF-8.
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x
            if dupe_col:
                # read_csv disambiguates the columns by
                # labeling them dupe.1, dupe.2, etc.; monkey patch columns
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # Rebuild the row MultiIndex from the current index plus the
                # first ``rnlvl - 1`` columns, then drop those columns
                # (presumably from_csv kept only the first level as the
                # index here -- confirm).
                delta_lvl = [recons.iloc[
                    :, i].values for i in range(rnlvl - 1)]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            # numpy dtype used for each label type code
            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
            if r_dtype:
                if r_dtype == 'u':  # unicode
                    r_dtype = 'O'
                    recons.index = np.array(lmap(_to_uni, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
                elif r_dtype == 'dt':  # datetime
                    r_dtype = 'O'
                    recons.index = np.array(lmap(Timestamp, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(
                        lmap(Timestamp, df.index), dtype=r_dtype)
                elif r_dtype == 'p':
                    # converted via to_datetime() and compared as Timestamps
                    r_dtype = 'O'
                    recons.index = np.array(
                        list(map(Timestamp, recons.index.to_datetime())),
                        dtype=r_dtype)
                    df.index = np.array(
                        list(map(Timestamp, df.index.to_datetime())),
                        dtype=r_dtype)
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                # same normalisation as for the rows, applied to the columns
                if c_dtype == 'u':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(_to_uni, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(
                        lmap(_to_uni, df.columns), dtype=c_dtype)
                elif c_dtype == 'dt':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(
                        lmap(Timestamp, df.columns), dtype=c_dtype)
                elif c_dtype == 'p':
                    c_dtype = 'O'
                    recons.columns = np.array(
                        lmap(Timestamp, recons.columns.to_datetime()),
                        dtype=c_dtype)
                    df.columns = np.array(
                        lmap(Timestamp, df.columns.to_datetime()),
                        dtype=c_dtype)
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)

        N = 100
        chunksize = 1000

        # GH3437
        from pandas import NaT

        def make_dtnat_arr(n, nnat=None):
            # List of ``n`` timestamps at 5-minute frequency in which ``nnat``
            # randomly chosen positions (default: 10% of n) are set to NaT.
            if nnat is None:
                nnat = int(n * 0.1)  # 10%
            s = list(date_range('2000', freq='5min', periods=n))
            if nnat:
                for i in np.random.randint(0, len(s), nnat):
                    s[i] = NaT
                # additionally blank one random position counted from each end
                i = np.random.randint(100)
                s[-i] = NaT
                s[i] = NaT
            return s

        # N=35000
        s1 = make_dtnat_arr(chunksize + 5)
        s2 = make_dtnat_arr(chunksize + 5, 0)
        # NOTE: rebinds ``path`` for every ``_do_test`` call below
        path = '1.csv'

        # s3=make_dtnjat_arr(chunksize+5,0)
        with ensure_clean('.csv') as pth:
            df = DataFrame(dict(a=s1, b=s2))
            df.to_csv(pth, chunksize=chunksize)
            recons = DataFrame.from_csv(pth)._convert(datetime=True,
                                                      coerce=True)
            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)

        # datetime row index, string columns
        for ncols in [4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_type='dt',
                              c_idx_type='s'), path, 'dt', 's')

        # NOTE(review): this loop repeats the previous one verbatim (plus a
        # trailing no-op ``pass``) -- confirm whether it is intentional.
        for ncols in [4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_type='dt',
                              c_idx_type='s'), path, 'dt', 's')
                pass

        # several row / column index type combinations
        for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'),
                                       ('p', 'p')]:
            for ncols in [1, 2, 3, 4]:
                base = int((chunksize // ncols or 1) or 1)
                for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                              2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                              base - 1, base, base + 1]:
                    _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type,
                                  c_idx_type=c_idx_type),
                             path, r_idx_type, c_idx_type)

        # default index types
        for ncols in [1, 2, 3, 4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols), path)

        # duplicate labels at both ends of the columns and of the index
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = mkdf(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, path, dupe_col=True)

        # frame with an index but no columns, then multi-level row/column
        # indexes
        _do_test(DataFrame(index=lrange(10)), path)
        _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), path, rnlvl=2)
        for ncols in [2, 3, 4]:
            base = int(chunksize // ncols)
            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), path, rnlvl=2)
                _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), path, cnlvl=2)
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
                         path, rnlvl=2, cnlvl=2)
Example #38
0
 def test_select_dtypes_typecodes(self):
     """``select_dtypes`` given every 'AllFloat' typecode returns the frame."""
     # GH 11990
     frame = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
     float_codes = [code for code in np.typecodes['AllFloat']]
     selected = frame.select_dtypes(float_codes)
     # The expected value is the very same frame object that was filtered.
     assert_frame_equal(selected, frame)
Example #39
0
    def test_to_csv_multiindex(self):
        """Round-trip frames with MultiIndex rows and/or columns through CSV.

        Exercises ``self.frame`` and ``self.tsframe`` with a MultiIndex on the
        rows, generated frames with multi-level columns (GH3571, GH1651,
        GH3141), invalid ``header`` / ``columns`` arguments, and an empty
        frame.
        """
        tmp_name = "__tmp_to_csv_multiindex__"

        # ``float_frame`` is ``self.frame`` itself; its index is restored
        # once the round trip has been checked.
        float_frame = self.frame
        saved_index = float_frame.index
        level_values = np.arange(len(saved_index) * 2).reshape(2, -1)
        float_frame.index = MultiIndex.from_arrays(
            level_values, names=["first", "second"]
        )

        with ensure_clean(tmp_name) as path:
            float_frame.to_csv(path, header=False)
            float_frame.to_csv(path, columns=["A", "B"])

            # round trip
            float_frame.to_csv(path)
            roundtrip = self.read_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            assert_frame_equal(float_frame, roundtrip, check_names=False)
            assert float_frame.index.names == roundtrip.index.names

            # needed if setUp becomes a class method
            self.frame.index = saved_index

            # try multiindex with dates
            ts_frame = self.tsframe
            saved_index = ts_frame.index
            ts_frame.index = MultiIndex.from_arrays(
                [saved_index, np.arange(len(saved_index))]
            )

            ts_frame.to_csv(path, index_label=["time", "foo"])
            reread = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            assert_frame_equal(ts_frame, reread, check_names=False)

            # do not load index
            ts_frame.to_csv(path)
            reread = self.read_csv(path, index_col=None)
            assert len(reread.columns) == len(ts_frame.columns) + 2

            # no index
            ts_frame.to_csv(path, index=False)
            reread = self.read_csv(path, index_col=None)
            assert_almost_equal(reread.values, self.tsframe.values)

            # needed if setUp becomes class method
            self.tsframe.index = saved_index

        with ensure_clean(tmp_name) as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                # 3x3 int64 frame with two-level columns; ``names=True``
                # selects the level names ["first", "second"].
                level_names = ["first", "second"] if names is True else names
                values = np.random.randint(0, 10, size=(3, 3))
                columns = MultiIndex.from_tuples(
                    [("bah", "foo"), ("bah", "bar"), ("ban", "baz")],
                    names=level_names,
                )
                return DataFrame(values, columns=columns, dtype="int64")

            # column & index are multi-index / column is mi / dup column
            # names?  (2, 1 and 3 row-index levels respectively)
            for n_row_levels, index_col in [(2, [0, 1]), (1, 0), (3, [0, 1, 2])]:
                generated = mkdf(5, 3, r_idx_nlevels=n_row_levels, c_idx_nlevels=4)
                generated.to_csv(path)
                parsed = read_csv(path, header=[0, 1, 2, 3], index_col=index_col)
                assert_frame_equal(generated, parsed)

            # writing with no index
            unnamed = _make_frame()
            unnamed.to_csv(path, index=False)
            assert_frame_equal(unnamed, read_csv(path, header=[0, 1]))

            # we lose the names here
            named = _make_frame(True)
            named.to_csv(path, index=False)
            parsed = read_csv(path, header=[0, 1])
            assert com._all_none(*parsed.columns.names)
            parsed.columns.names = named.columns.names
            assert_frame_equal(named, parsed)

            # whatsnew example
            for names in (None, True):
                sample = _make_frame(names)
                sample.to_csv(path)
                parsed = read_csv(path, header=[0, 1], index_col=[0])
                assert_frame_equal(sample, parsed)

            # invalid options
            named = _make_frame(True)
            named.to_csv(path)

            for n_header in [6, 7]:
                msg = "len of {i}, but only 5 lines in file".format(i=n_header)
                with pytest.raises(ParserError, match=msg):
                    read_csv(path, header=list(range(n_header)), index_col=0)

            # write with cols
            msg = "cannot specify cols with a MultiIndex"
            with pytest.raises(TypeError, match=msg):
                named.to_csv(path, columns=["foo", "bar"])

        with ensure_clean(tmp_name) as path:
            # empty
            ts_frame[:0].to_csv(path)
            reread = self.read_csv(path)

            empty = ts_frame[:0]
            empty.index = []

            tm.assert_index_equal(reread.columns, empty.columns)
            assert len(reread) == 0
Example #40
0
    def test_dups_fancy_indexing(self):
        """Cover selection on frames whose row or column labels repeat.

        The scenarios include selectors that name labels missing from the
        axis; comments carry the GH issue numbers of the original reports.
        """
        from pandas.util.testing import makeCustomDataframe as mkdf

        # GH 3455
        frame = mkdf(10, 3)
        frame.columns = ['a', 'a', 'b']
        picked_columns = frame[['b', 'a']].columns
        tm.assert_index_equal(picked_columns, Index(['b', 'a', 'a']))

        # across dtypes
        labelled_at_build = DataFrame(
            [[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa'))
        labelled_at_build.head()
        str(labelled_at_build)
        labelled_after = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        labelled_after.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = labelled_at_build.iloc[:, 4]  # noqa
        res_v = labelled_after.iloc[:, 4]  # noqa

        tm.assert_frame_equal(labelled_at_build, labelled_after)

        # GH 3561, dups not in selected order
        frame = DataFrame(
            {'test': [5, 7, 9, 11],
             'test1': [4., 5, 6, 7],
             'other': list('abcd')},
            index=['A', 'A', 'B', 'C'])
        labels = ['C', 'B']
        wanted = DataFrame(
            {'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c']},
            index=labels)
        # The same selection is made with a plain list and with an Index.
        for selector in (labels, Index(labels)):
            tm.assert_frame_equal(frame.loc[selector], wanted)

        labels = ['C', 'B', 'E']
        wanted = DataFrame(
            {'test': [11, 9, np.nan],
             'test1': [7., 6, np.nan],
             'other': ['d', 'c', np.nan]},
            index=labels)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            actual = frame.loc[labels]
        tm.assert_frame_equal(actual, wanted)

        # see GH5553, make sure we use the right indexer
        labels = ['F', 'G', 'H', 'C', 'B', 'E']
        wanted = DataFrame(
            {'test': [np.nan] * 3 + [11, 9, np.nan],
             'test1': [np.nan] * 3 + [7., 6, np.nan],
             'other': [np.nan] * 3 + ['d', 'c', np.nan]},
            index=labels)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            actual = frame.loc[labels]
        tm.assert_frame_equal(actual, wanted)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        unique_rows = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        wanted = unique_rows.reindex(['E'])

        repeated_rows = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            actual = repeated_rows.ix[['E']]
        tm.assert_frame_equal(actual, wanted)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        for column_values, picked_values in (
                ([0, 1, 2], [0, np.nan, 0]),
                (list('abc'), ['a', np.nan, 'a'])):
            frame = DataFrame({"A": column_values})
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                actual = frame.loc[[0, 8, 0]]
            wanted = DataFrame({"A": picked_values}, index=[0, 8, 0])
            tm.assert_frame_equal(actual, wanted, check_index_type=False)

        # non unique with non unique selector
        frame = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        wanted = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                           index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            actual = frame.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(actual, wanted)

        # GH 5835
        # dups on index and missing values
        frame = DataFrame(np.random.randn(5, 5),
                          columns=['A', 'B', 'B', 'B', 'A'])

        present_part = frame.loc[:, ['A', 'B']]
        absent_part = DataFrame(np.nan, columns=['C'], index=frame.index)
        wanted = pd.concat([present_part, absent_part], axis=1)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            actual = frame.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(actual, wanted)

        # GH 6504, multi-axis indexing
        frame = DataFrame(np.random.randn(9, 2),
                          index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                          columns=['a', 'b'])

        # rows only
        first_six = frame.iloc[0:6]
        tm.assert_frame_equal(frame.loc[[1, 2]], first_six)

        # columns only: naming every column gives back the whole frame
        tm.assert_frame_equal(frame.loc[:, ['a', 'b']], frame)

        # rows and columns together
        first_six_all_columns = frame.iloc[0:6, :]
        tm.assert_frame_equal(frame.loc[[1, 2], ['a', 'b']],
                              first_six_all_columns)
Example #41
0
    def test_to_csv_multiindex(self):
        """Round-trip frames with MultiIndex rows and/or columns via CSV.

        Every write goes to a path yielded by ``ensure_clean``.  The test
        temporarily replaces the index of ``self.frame`` and ``self.tsframe``
        and puts the original index back part-way through the first block.
        """

        # ``frame`` is the very object stored on ``self``; assigning its
        # index below therefore changes ``self.frame`` until it is restored.
        frame = self.frame
        old_index = frame.index
        # Two rows of consecutive integers, one row per index level.
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
        frame.index = new_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:

            # These two writes are overwritten before anything is read back;
            # presumably they only check that the calls do not raise.
            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=['A', 'B'])

            # round trip
            frame.to_csv(path)

            df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            assert_frame_equal(frame, df, check_names=False)
            assert frame.index.names == df.index.names

            # needed if setUp becomes a class method
            # NOTE(review): not in a try/finally, so the restore is skipped
            # when one of the assertions above fails.
            self.frame.index = old_index

            # try multiindex with dates
            tsframe = self.tsframe
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=['time', 'foo'])
            recons = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = self.read_csv(path, index_col=None)
            # Both index levels are read as ordinary columns, hence "+ 2".
            assert len(recons.columns) == len(tsframe.columns) + 2

            # no index
            tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            assert_almost_equal(recons.values, self.tsframe.values)

            # needed if setUp becomes class method
            self.tsframe.index = old_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                """Return a 3x3 int64 frame of random ints with 2-level columns.

                ``names=True`` is shorthand for ``['first', 'second']``; any
                other value is passed through as the column level names.
                """
                if names is True:
                    names = ['first', 'second']
                return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                                 columns=MultiIndex.from_tuples(
                                     [('bah', 'foo'), ('bah', 'bar'),
                                      ('ban', 'baz')],
                                     names=names),
                                 dtype='int64')

            # column & index are multi-index
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path,
                              header=[0, 1, 2, 3],
                              index_col=[0, 1],
                              tupleize_cols=False)
            assert_frame_equal(df, result)

            # column is mi
            df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path,
                              header=[0, 1, 2, 3],
                              index_col=0,
                              tupleize_cols=False)
            assert_frame_equal(df, result)

            # dup column names?
            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path,
                              header=[0, 1, 2, 3],
                              index_col=[0, 1, 2],
                              tupleize_cols=False)
            assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, tupleize_cols=False, index=False)
            result = read_csv(path, header=[0, 1], tupleize_cols=False)
            assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False, index=False)
            result = read_csv(path, header=[0, 1], tupleize_cols=False)
            assert _all_none(*result.columns.names)
            # Copy the names across so the frame comparison can still pass.
            result.columns.names = df.columns.names
            assert_frame_equal(df, result)

            # tupleize_cols=True and index=False
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=True, index=False)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = read_csv(path,
                                  header=0,
                                  tupleize_cols=True,
                                  index_col=None)
            # The columns read back are replaced wholesale before comparing,
            # so only the values and the index are actually checked here.
            result.columns = df.columns
            assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path,
                              header=[0, 1],
                              index_col=[0],
                              tupleize_cols=False)
            assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False)
            result = read_csv(path,
                              header=[0, 1],
                              index_col=[0],
                              tupleize_cols=False)
            assert_frame_equal(df, result)

            # column & index are multi-index (compatibility)
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path, tupleize_cols=True)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = read_csv(path,
                                  header=0,
                                  index_col=[0, 1],
                                  tupleize_cols=True)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path, tupleize_cols=False)

            # Ask for more header rows (6, then 7) than the 5 lines the
            # expected error message says the file contains.
            for i in [6, 7]:
                msg = 'len of {i}, but only 5 lines in file'.format(i=i)
                with tm.assert_raises_regex(ParserError, msg):
                    read_csv(path,
                             tupleize_cols=False,
                             header=lrange(i),
                             index_col=0)

            # write with cols
            with tm.assert_raises_regex(
                    TypeError, 'cannot specify cols '
                    'with a MultiIndex'):
                df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar'])

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # empty
            # ``tsframe`` is still bound from the first ``with`` block above.
            tsframe[:0].to_csv(path)
            recons = self.read_csv(path)

            exp = tsframe[:0]
            exp.index = []

            tm.assert_index_equal(recons.columns, exp.columns)
            assert len(recons) == 0
Example #42
0
    def test_dups_fancy_indexing(self):
        """Check selection on frames whose row or column labels repeat.

        Covers column selection with duplicate column names, construction
        with duplicate columns across dtypes, and ``.ix`` row selection on a
        duplicated index, including selectors that name missing labels.
        """

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        cols = ['b', 'a']
        result = df[cols].columns
        expected = Index(['b', 'a', 'a'])
        # assertTrue replaces the deprecated ``assert_`` alias, which was
        # removed from unittest.TestCase in Python 3.12.
        self.assertTrue(result.equals(expected))

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # Positional access to one of the duplicated columns is evaluated
        # only so that a failure here raises; the values are not compared.
        df.iloc[:, 4]
        result.iloc[:, 4]

        assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame({'test': [11, 9]}, index=rows)
        result = df.ix[rows]
        assert_frame_equal(result, expected)

        result = df.ix[Index(rows)]
        assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame({'test': [11, 9, np.nan]}, index=rows)
        result = df.ix[rows]
        assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(randn(4, 3), index=list('ABCD'))
        expected = df.ix[['E']]

        dfnu = DataFrame(randn(5, 3), index=list('AABCD'))
        result = dfnu.ix[['E']]
        assert_frame_equal(result, expected)

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        result = df.ix[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        assert_frame_equal(result, expected)

        df = DataFrame({"A": list('abc')})
        result = df.ix[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        assert_frame_equal(result, expected)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        result = df.ix[['A', 'A', 'E']]
        assert_frame_equal(result, expected)
Example #43
0
def df(request):
    """Build the sample DataFrame selected by ``request.param``.

    Parameters
    ----------
    request : object
        Any object with a ``param`` attribute naming the data set.  This
        looks like a pytest fixture request, but no decorator is visible
        on this function -- confirm against the caller.

    Returns
    -------
    DataFrame
        One of the following, chosen by ``request.param``:

        * ``"delims"``   -- strings containing quotes, commas, tabs, pipes
        * ``"utf8"``     -- non-ASCII text
        * ``"utf16"``    -- text containing ``\\U0001f44d`` characters
        * ``"string"``   -- 5x3 frame generated by ``mkdf``
        * ``"long"``     -- ``display.max_rows + 1`` rows of ``randint(2)``
        * ``"nonascii"`` -- two short word columns, one with ``ñ``
        * ``"colwidth"`` -- cells one character wider than
          ``display.max_colwidth``
        * ``"mixed"``    -- float, int and string columns
        * ``"float"``    -- ``mkdf`` frame with ``float(r) + 0.01`` cells
        * ``"int"``      -- ``mkdf`` frame with ``randint(2)`` cells

    Raises
    ------
    ValueError
        If ``request.param`` is none of the names above.
    """
    data_type = request.param

    # Index/column layout shared by every frame generated through ``mkdf``.
    idx_kwargs = {
        "c_idx_type": "s",
        "r_idx_type": "i",
        "c_idx_names": [None],
        "r_idx_names": [None],
    }

    if data_type == "delims":
        return pd.DataFrame({
            "a": ['"a,\t"b|c', "d\tef´"],
            "b": ["hi'j", "k''lm"],
        })
    if data_type == "utf8":
        return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]})
    if data_type == "utf16":
        return pd.DataFrame({
            "a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"],
            "b": ["abc", "def"],
        })
    if data_type == "string":
        return mkdf(5, 3, **idx_kwargs)
    if data_type == "long":
        # The option is read only when this data set is requested.
        max_rows = get_option("display.max_rows")
        return mkdf(max_rows + 1, 3,
                    data_gen_f=lambda *args: randint(2),
                    **idx_kwargs)
    if data_type == "nonascii":
        return pd.DataFrame({
            "en": "in English".split(),
            "es": "en español".split(),
        })
    if data_type == "colwidth":
        width = get_option("display.max_colwidth") + 1
        return mkdf(5, 3,
                    data_gen_f=lambda *args: "x" * width,
                    **idx_kwargs)
    if data_type == "mixed":
        return pd.DataFrame({
            "a": np.arange(1.0, 6.0) + 0.01,
            "b": np.arange(1, 6),
            "c": list("abcde"),
        })
    if data_type == "float":
        return mkdf(5, 3,
                    data_gen_f=lambda r, c: float(r) + 0.01,
                    **idx_kwargs)
    if data_type == "int":
        return mkdf(5, 3,
                    data_gen_f=lambda *args: randint(2),
                    **idx_kwargs)

    # Name the rejected value so a mistyped parameter is easy to spot.
    raise ValueError("unknown data type: {!r}".format(data_type))