Example #1
0
    def test_register_writer(self):
        """Check that a registered writer class is used by Excel dispatch.

        A dummy ``ExcelWriter`` subclass records calls to ``save`` and
        ``write_cells``; the test asserts both are hit when writing through
        ``DataFrame.to_excel`` with the 'dummy' engine selected either by
        option or by the ``engine`` argument.
        """
        # Call logs shared with the dummy writer through the closure.
        save_log = []
        write_cells_log = []

        class DummyClass(ExcelWriter):
            called_save = False
            called_write_cells = False
            supported_extensions = ['xlsx', 'xls']
            engine = 'dummy'

            def save(self):
                save_log.append(True)

            def write_cells(self, *args, **kwargs):
                write_cells_log.append(True)

        def check_called(func):
            # Run ``func`` and require that both hooks fired, then reset
            # the logs so the next check starts clean.
            func()
            assert len(save_log) >= 1
            assert len(write_cells_log) >= 1
            del save_log[:]
            del write_cells_log[:]

        with pd.option_context('io.excel.xlsx.writer', 'dummy'):
            register_writer(DummyClass)
            writer = ExcelWriter('something.xlsx')
            assert isinstance(writer, DummyClass)
            frame = tm.makeCustomDataframe(1, 1)

            def write_via_option():
                frame.to_excel('something.xlsx')

            def write_via_engine_arg():
                frame.to_excel('something.xls', engine='dummy')

            check_called(write_via_option)
            check_called(write_via_engine_arg)
Example #2
0
    def test_join_with_period_index(self, join_type):
        """Joining the 'p'-typed columns with the 'dt'-typed row index raises.

        The expected ValueError message mentions PeriodIndex-ed objects.
        """
        frame = tm.makeCustomDataframe(
            10, 10, data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type='p', r_idx_type='dt')
        columns = frame.columns
        row_index = frame.iloc[:5, 0].index

        with pytest.raises(
                ValueError,
                match='can only call with other PeriodIndex-ed objects'):
            columns.join(row_index, how=join_type)
Example #3
0
    def test_does_not_convert_mixed_integer(self):
        """Outer-joining columns with the row index yields an object index.

        Joining that result with the columns again must not change it.
        """
        frame = tm.makeCustomDataframe(
            10, 10,
            data_gen_f=lambda *args, **kwargs: randn(),
            r_idx_type="i", c_idx_type="td")
        # Smoke check: building the string form must not raise.
        str(frame)

        outer = frame.columns.join(frame.index, how="outer")
        rejoined = outer.join(frame.columns)
        object_dtype = np.dtype("O")
        self.assertEqual(outer.dtype, object_dtype)
        self.assertEqual(outer.dtype, rejoined.dtype)
        tm.assert_index_equal(outer, rejoined)
Example #4
0
    def test_join_with_period_index(self, join_type):
        """Joining 'p'-typed columns matches joining their object-cast form."""
        frame = tm.makeCustomDataframe(
            10, 10, data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type='p', r_idx_type='dt')
        columns = frame.columns
        row_index = frame.iloc[:5, 0].index

        as_object = columns.astype('O')
        expected = as_object.join(row_index, how=join_type)
        result = columns.join(row_index, how=join_type)
        tm.assert_index_equal(expected, result)
Example #5
0
 def test_does_not_convert_mixed_integer(self):
     """Outer-joining columns with the row index yields an object index.

     Joining that result with the columns again must leave dtype and
     values unchanged.
     """
     frame = tm.makeCustomDataframe(
         10, 10,
         data_gen_f=lambda *args, **kwargs: randn(),
         r_idx_type='i', c_idx_type='dt')
     outer = frame.columns.join(frame.index, how='outer')
     rejoined = outer.join(frame.columns)
     object_dtype = np.dtype('O')
     assert outer.dtype == object_dtype
     assert outer.dtype == rejoined.dtype
     tm.assert_numpy_array_equal(outer.values, rejoined.values)
Example #6
0
    def test_join_does_not_recur(self):
        """Outer join of the row index with the columns gives an object Index.

        The expected result lists the two row labels followed by the two
        column labels.
        """
        frame = tm.makeCustomDataframe(
            3, 2, data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type='p', r_idx_type='dt')
        row_index = frame.iloc[:2, 0].index
        columns = frame.columns

        labels = [row_index[0], row_index[1], columns[0], columns[1]]
        expected = Index(labels, object)
        res = row_index.join(columns, how='outer')
        tm.assert_index_equal(res, expected)
Example #7
0
    def test_join_with_period_index(self, join_type):
        """Joining the 'p'-typed columns with the 'dt'-typed row index raises.

        The expected ValueError message mentions PeriodIndex-ed objects.
        """
        frame = tm.makeCustomDataframe(
            10, 10, data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type='p', r_idx_type='dt')
        columns = frame.columns
        row_index = frame.iloc[:5, 0].index

        msg = 'can only call with other PeriodIndex-ed objects'
        with tm.assert_raises_regex(ValueError, msg):
            columns.join(row_index, how=join_type)
Example #8
0
    def test_join_with_period_index(self):
        """Every join method raises when the other index is not period-typed.

        Runs 'left', 'right', 'inner' and 'outer' joins of the 'p'-typed
        columns with the 'dt'-typed row index and expects a ValueError each
        time.
        """
        frame = tm.makeCustomDataframe(
            10, 10, data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type='p', r_idx_type='dt')
        row_index = frame.iloc[:5, 0].index

        msg = 'can only call with other PeriodIndex-ed objects'
        for how in ('left', 'right', 'inner', 'outer'):
            with tm.assertRaisesRegexp(ValueError, msg):
                frame.columns.join(row_index, how=how)
Example #9
0
    def test_header_multi_index(self):
        expected = tm.makeCustomDataframe(
            5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

        data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""

        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[
            0, 1], tupleize_cols=False)
        tm.assert_frame_equal(df, expected)

        # skipping lines in the header
        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[
            0, 1], tupleize_cols=False)
        tm.assert_frame_equal(df, expected)

        # INVALID OPTIONS

        # no as_recarray
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            pytest.raises(ValueError, self.read_csv,
                          StringIO(data), header=[0, 1, 2, 3],
                          index_col=[0, 1], as_recarray=True,
                          tupleize_cols=False)

        # names
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], names=['foo', 'bar'],
                      tupleize_cols=False)

        # usecols
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], usecols=['foo', 'bar'],
                      tupleize_cols=False)

        # non-numeric index_col
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=['foo', 'bar'], tupleize_cols=False)
Example #10
0
def test_slice_locs_with_type_mismatch():
    """``slice_locs`` on a stacked index raises TypeError for mismatched keys.

    Checked on the stacked index of a time frame and of a custom frame; each
    call must fail with a message starting with 'Level type mismatch'.
    """
    pattern = '^Level type mismatch'

    time_frame = tm.makeTimeDataFrame()
    time_index = time_frame.stack().index
    with pytest.raises(TypeError, match=pattern):
        time_index.slice_locs((1, 3))
    with pytest.raises(TypeError, match=pattern):
        time_index.slice_locs(
            time_frame.index[5] + timedelta(seconds=30), (5, 2))

    custom_frame = tm.makeCustomDataframe(5, 5)
    custom_index = custom_frame.stack().index
    with pytest.raises(TypeError, match=pattern):
        custom_index.slice_locs(timedelta(seconds=30))
    # TODO: Try creating a UnicodeDecodeError in exception message
    with pytest.raises(TypeError, match=pattern):
        custom_index.slice_locs(custom_frame.index[1], (16, "a"))
Example #11
0
    def test_header_multi_index(self):
        expected = tm.makeCustomDataframe(
            5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

        data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""

        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                           index_col=[0, 1])
        tm.assert_frame_equal(df, expected)

        # skipping lines in the header
        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                           index_col=[0, 1])
        tm.assert_frame_equal(df, expected)

        # INVALID OPTIONS

        # names
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], names=['foo', 'bar'])

        # usecols
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], usecols=['foo', 'bar'])

        # non-numeric index_col
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=['foo', 'bar'])
Example #12
0
def test_header_multi_index(all_parsers):
    """Parse a CSV with four header rows and two index columns.

    The result must equal a ``makeCustomDataframe`` frame with two row-index
    levels and four column-index levels.
    """
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
    result = all_parsers.read_csv(
        StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
    tm.assert_frame_equal(result, expected)
Example #13
0
def test_header_multi_index(all_parsers):
    """Parse a CSV with four header rows and two index columns.

    The result must equal a ``makeCustomDataframe`` frame with two row-index
    levels and four column-index levels.
    """
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    read_options = dict(header=[0, 1, 2, 3], index_col=[0, 1])
    expected = tm.makeCustomDataframe(
        5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    buffer = StringIO(data)
    result = all_parsers.read_csv(buffer, **read_options)
    tm.assert_frame_equal(result, expected)
Example #14
0
    def test_iloc_empty_list_indexer_is_ok(self):
        """``iloc`` with an empty list must match the equivalent empty slice.

        Index and column types are compared strictly in every case.
        """
        df = tm.makeCustomDataframe(5, 2)
        strict = dict(check_index_type=True, check_column_type=True)

        # empty column selection
        tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], **strict)
        # empty row selection
        tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], **strict)
        # empty row selection given as a single indexer
        tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], **strict)
Example #15
0
    def test_register_writer(self):
        """Check that a registered writer class is used by Excel dispatch.

        A dummy ``ExcelWriter`` subclass records calls to ``save`` and
        ``write_cells``.  The test asserts both hooks fire when writing a
        frame and a panel to the custom '.test' extension, and when the
        'dummy' engine is selected for 'xlsx'/'xls' by option or argument.
        """
        # some awkward mocking to test out dispatch and such actually works
        called_save = []
        called_write_cells = []

        class DummyClass(ExcelWriter):
            # NOTE(review): these two class attributes are not read in this
            # test; the enclosing lists above are what record the calls.
            called_save = False
            called_write_cells = False
            supported_extensions = ['test', 'xlsx', 'xls']
            engine = 'dummy'

            def save(self):
                called_save.append(True)

            def write_cells(self, *args, **kwargs):
                called_write_cells.append(True)

        def check_called(func):
            # Run ``func`` and require that both hooks fired, then reset the
            # logs so the next check starts clean.
            func()
            self.assertTrue(len(called_save) >= 1)
            self.assertTrue(len(called_write_cells) >= 1)
            del called_save[:]
            del called_write_cells[:]

        register_writer(DummyClass)
        writer = ExcelWriter('something.test')
        tm.assert_isinstance(writer, DummyClass)
        df = tm.makeCustomDataframe(1, 1)
        panel = tm.makePanel()
        check_called(lambda: df.to_excel('something.test'))
        check_called(lambda: panel.to_excel('something.test'))

        from pandas import set_option, get_option
        val = get_option('io.excel.xlsx.writer')
        set_option('io.excel.xlsx.writer', 'dummy')
        try:
            check_called(lambda: df.to_excel('something.xlsx'))
            check_called(
                lambda: df.to_excel('something.xls', engine='dummy'))
        finally:
            # Restore the global option even when a check fails, so the
            # 'dummy' engine does not leak into other tests.
            set_option('io.excel.xlsx.writer', val)
Example #16
0
    def test_register_writer(self):
        """Check that a registered writer class is used by Excel dispatch.

        A dummy ``ExcelWriter`` subclass records calls to ``save`` and
        ``write_cells``.  The test asserts both hooks fire when writing a
        frame and a panel to the custom '.test' extension, and when the
        'dummy' engine is selected for 'xlsx'/'xls' by option or argument.
        """
        # some awkward mocking to test out dispatch and such actually works
        called_save = []
        called_write_cells = []

        class DummyClass(ExcelWriter):
            # NOTE(review): these two class attributes are not read in this
            # test; the enclosing lists above are what record the calls.
            called_save = False
            called_write_cells = False
            supported_extensions = ['test', 'xlsx', 'xls']
            engine = 'dummy'

            def save(self):
                called_save.append(True)

            def write_cells(self, *args, **kwargs):
                called_write_cells.append(True)

        def check_called(func):
            # Run ``func`` and require that both hooks fired, then reset the
            # logs so the next check starts clean.
            func()
            self.assertTrue(len(called_save) >= 1)
            self.assertTrue(len(called_write_cells) >= 1)
            del called_save[:]
            del called_write_cells[:]

        register_writer(DummyClass)
        writer = ExcelWriter('something.test')
        tm.assert_isinstance(writer, DummyClass)
        df = tm.makeCustomDataframe(1, 1)
        panel = tm.makePanel()
        check_called(lambda: df.to_excel('something.test'))
        check_called(lambda: panel.to_excel('something.test'))

        from pandas import set_option, get_option
        val = get_option('io.excel.xlsx.writer')
        set_option('io.excel.xlsx.writer', 'dummy')
        try:
            check_called(lambda: df.to_excel('something.xlsx'))
            check_called(
                lambda: df.to_excel('something.xls', engine='dummy'))
        finally:
            # Restore the global option even when a check fails, so the
            # 'dummy' engine does not leak into other tests.
            set_option('io.excel.xlsx.writer', val)
Example #17
0
    def test_to_csv_multiindex(self, float_frame, datetime_frame):
        """Round-trip frames with MultiIndex rows and/or columns through CSV.

        Covers, in order: a two-level row index on ``float_frame``; the
        original index of ``datetime_frame`` paired with an integer range;
        MultiIndex columns with and without level names (GH3571, GH1651,
        GH3141); reader/writer options that must be rejected; and writing an
        empty frame.

        NOTE(review): ``self.read_csv`` is defined outside this block;
        presumably it wraps ``read_csv`` with test defaults -- confirm.
        """

        frame = float_frame
        old_index = frame.index
        # Two rows of consecutive integers, one row per index level.
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
        frame.index = new_index

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:

            # These two writes are never read back: the round-trip write
            # below overwrites the same path before the first read.
            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=["A", "B"])

            # round trip
            frame.to_csv(path)

            df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            tm.assert_frame_equal(frame, df, check_names=False)
            assert frame.index.names == df.index.names

            # needed if setUp becomes a class method
            float_frame.index = old_index

            # try multiindex with dates
            tsframe = datetime_frame
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=["time", "foo"])
            recons = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            tm.assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = self.read_csv(path, index_col=None)
            # The two index levels come back as ordinary columns.
            assert len(recons.columns) == len(tsframe.columns) + 2

            # no index
            tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            tm.assert_almost_equal(recons.values, datetime_frame.values)

            # needed if setUp becomes class method
            datetime_frame.index = old_index

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                """Return a 3x3 int64 frame of random ints with two-level columns.

                ``names=True`` selects the level names ["first", "second"];
                any other value is passed to ``from_tuples`` unchanged.
                """
                if names is True:
                    names = ["first", "second"]
                return DataFrame(
                    np.random.randint(0, 10, size=(3, 3)),
                    columns=MultiIndex.from_tuples([("bah", "foo"),
                                                    ("bah", "bar"),
                                                    ("ban", "baz")],
                                                   names=names),
                    dtype="int64",
                )

            # column & index are multi-index
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
            tm.assert_frame_equal(df, result)

            # column is mi
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
            tm.assert_frame_equal(df, result)

            # dup column names?
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
            tm.assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            tm.assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert com.all_none(*result.columns.names)
            # Put the names back so the frames can be compared in full.
            result.columns.names = df.columns.names
            tm.assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path)

            # Header lists longer than the file's 5 lines must be rejected.
            for i in [6, 7]:
                msg = "len of {i}, but only 5 lines in file".format(i=i)
                with pytest.raises(ParserError, match=msg):
                    read_csv(path, header=list(range(i)), index_col=0)

            # write with cols
            msg = "cannot specify cols with a MultiIndex"
            with pytest.raises(TypeError, match=msg):
                df.to_csv(path, columns=["foo", "bar"])

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            # empty
            # ``tsframe`` is the datetime_frame fixture bound in the first
            # block, where its original index was already restored.
            tsframe[:0].to_csv(path)
            recons = self.read_csv(path)

            exp = tsframe[:0]
            exp.index = []

            tm.assert_index_equal(recons.columns, exp.columns)
            assert len(recons) == 0
Example #18
0
def df(request):
    """Build the DataFrame variant named by ``request.param``.

    Parameters
    ----------
    request : object
        Only its ``param`` attribute is read.  Supported values are
        "delims", "utf8", "utf16", "string", "long", "nonascii",
        "colwidth", "mixed", "float" and "int".

    Returns
    -------
    DataFrame
        The frame for the requested variant.

    Raises
    ------
    ValueError
        If ``request.param`` is not one of the supported values; the
        message names the offending value.
    """
    data_type = request.param

    def _custom_frame(nrows, **extra):
        # Shared settings of every makeCustomDataframe-based variant:
        # 3 columns, "s" column index, "i" row index, unnamed indexes.
        return tm.makeCustomDataframe(
            nrows,
            3,
            c_idx_type="s",
            r_idx_type="i",
            c_idx_names=[None],
            r_idx_names=[None],
            **extra
        )

    if data_type == "delims":
        return pd.DataFrame({
            "a": ['"a,\t"b|c', "d\tef´"],
            "b": ["hi'j", "k''lm"]
        })
    elif data_type == "utf8":
        return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]})
    elif data_type == "utf16":
        return pd.DataFrame({
            "a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"],
            "b": ["abc", "def"]
        })
    elif data_type == "string":
        return _custom_frame(5)
    elif data_type == "long":
        # One row more than the display limit.
        max_rows = get_option("display.max_rows")
        return _custom_frame(max_rows + 1,
                             data_gen_f=lambda *args: randint(2))
    elif data_type == "nonascii":
        return pd.DataFrame({
            "en": "in English".split(),
            "es": "en español".split()
        })
    elif data_type == "colwidth":
        # Every cell is one character wider than the display column width.
        _cw = get_option("display.max_colwidth") + 1
        return _custom_frame(5, data_gen_f=lambda *args: "x" * _cw)
    elif data_type == "mixed":
        return DataFrame({
            "a": np.arange(1.0, 6.0) + 0.01,
            "b": np.arange(1, 6),
            "c": list("abcde")
        })
    elif data_type == "float":
        return _custom_frame(5, data_gen_f=lambda r, c: float(r) + 0.01)
    elif data_type == "int":
        return _custom_frame(5, data_gen_f=lambda *args: randint(2))
    else:
        raise ValueError("unknown data_type: {!r}".format(data_type))
Example #19
0
    def test_header_multi_index(self):
        expected = tm.makeCustomDataframe(5,
                                          3,
                                          r_idx_nlevels=2,
                                          c_idx_nlevels=4)

        data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""

        df = self.read_csv(StringIO(data),
                           header=[0, 1, 2, 3],
                           index_col=[0, 1],
                           tupleize_cols=False)
        tm.assert_frame_equal(df, expected)

        # skipping lines in the header
        df = self.read_csv(StringIO(data),
                           header=[0, 1, 2, 3],
                           index_col=[0, 1],
                           tupleize_cols=False)
        tm.assert_frame_equal(df, expected)

        # INVALID OPTIONS

        # no as_recarray
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            pytest.raises(ValueError,
                          self.read_csv,
                          StringIO(data),
                          header=[0, 1, 2, 3],
                          index_col=[0, 1],
                          as_recarray=True,
                          tupleize_cols=False)

        # names
        pytest.raises(ValueError,
                      self.read_csv,
                      StringIO(data),
                      header=[0, 1, 2, 3],
                      index_col=[0, 1],
                      names=['foo', 'bar'],
                      tupleize_cols=False)

        # usecols
        pytest.raises(ValueError,
                      self.read_csv,
                      StringIO(data),
                      header=[0, 1, 2, 3],
                      index_col=[0, 1],
                      usecols=['foo', 'bar'],
                      tupleize_cols=False)

        # non-numeric index_col
        pytest.raises(ValueError,
                      self.read_csv,
                      StringIO(data),
                      header=[0, 1, 2, 3],
                      index_col=['foo', 'bar'],
                      tupleize_cols=False)
Example #20
0
    def test_dups_fancy_indexing(self):
        """Exercise label-based selection with duplicated or missing labels.

        Covers duplicate column names (GH 3455), duplicate row labels
        selected out of order (GH 3561), the indexer used for partly-missing
        labels (GH5553), a selector made only of a missing label, and
        duplicate selectors that include a missing label (GH 4619).
        """

        def loc_with_future_warning(frame, labels):
            # ``.loc`` lookup that is required to emit a FutureWarning.
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                return frame.loc[labels]

        # GH 3455

        dup_cols = tm.makeCustomDataframe(10, 3)
        dup_cols.columns = ["a", "a", "b"]
        tm.assert_index_equal(dup_cols[["b", "a"]].columns,
                              Index(["b", "a", "a"]))

        # across dtypes
        named_at_build = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
                                   columns=list("aaaaaaa"))
        named_at_build.head()
        str(named_at_build)
        named_later = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        named_later.columns = list("aaaaaaa")

        # TODO(wesm): unused?
        build_col = named_at_build.iloc[:, 4]  # noqa
        later_col = named_later.iloc[:, 4]  # noqa

        tm.assert_frame_equal(named_at_build, named_later)

        # GH 3561, dups not in selected order
        dup_rows = DataFrame(
            {
                "test": [5, 7, 9, 11],
                "test1": [4.0, 5, 6, 7],
                "other": list("abcd")
            },
            index=["A", "A", "B", "C"],
        )
        rows = ["C", "B"]
        expected = DataFrame(
            {
                "test": [11, 9],
                "test1": [7.0, 6],
                "other": ["d", "c"]
            },
            index=rows)
        for selector in (rows, Index(rows)):
            tm.assert_frame_equal(dup_rows.loc[selector], expected)

        rows = ["C", "B", "E"]
        expected = DataFrame(
            {
                "test": [11, 9, np.nan],
                "test1": [7.0, 6, np.nan],
                "other": ["d", "c", np.nan],
            },
            index=rows,
        )
        result = loc_with_future_warning(dup_rows, rows)
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ["F", "G", "H", "C", "B", "E"]
        expected = DataFrame(
            {
                "test": [np.nan, np.nan, np.nan, 11, 9, np.nan],
                "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan],
                "other": [np.nan, np.nan, np.nan, "d", "c", np.nan],
            },
            index=rows,
        )
        result = loc_with_future_warning(dup_rows, rows)
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
        missing_msg = re.escape(
            "\"None of [Index(['E'], dtype='object')] are in the [index]\""
        )
        with pytest.raises(KeyError, match=missing_msg):
            dfnu.loc[["E"]]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        int_frame = DataFrame({"A": [0, 1, 2]})
        result = loc_with_future_warning(int_frame, [0, 8, 0])
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        str_frame = DataFrame({"A": list("abc")})
        result = loc_with_future_warning(str_frame, [0, 8, 0])
        expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        non_unique = DataFrame({"test": [5, 7, 9, 11]},
                               index=["A", "A", "B", "C"])
        expected = DataFrame({"test": [5, 7, 5, 7, np.nan]},
                             index=["A", "A", "A", "A", "E"])
        result = loc_with_future_warning(non_unique, ["A", "A", "E"])
        tm.assert_frame_equal(result, expected)
Example #21
0
    pd.DataFrame({}),
    pd.DataFrame({"x": [1, 2, 3]}),
    pd.DataFrame({"x": [1.0, 2.0, 3.0]}),
    pd.DataFrame({0: [1, 2, 3]}),
    pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]}),
    pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=pd.Index([4, 5, 6], name="bar")),
    pd.Series([1.0, 2.0, 3.0]),
    pd.Series([1.0, 2.0, 3.0], name="foo"),
    pd.Series([1.0, 2.0, 3.0], name="foo", index=[4, 5, 6]),
    pd.Series([1.0, 2.0, 3.0], name="foo", index=pd.Index([4, 5, 6], name="bar")),
    pd.DataFrame({"x": ["a", "b", "c"]}),
    pd.DataFrame({"x": [b"a", b"b", b"c"]}),
    pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=True)}),
    pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=False)}),
    tm.makeCategoricalIndex(),
    tm.makeCustomDataframe(5, 3),
    tm.makeDataFrame(),
    tm.makeDateIndex(),
    tm.makeMissingDataframe(),
    tm.makeMixedDataFrame(),
    tm.makeObjectSeries(),
    tm.makePeriodFrame(),
    tm.makeRangeIndex(),
    tm.makeTimeDataFrame(),
    tm.makeTimeSeries(),
    tm.makeUnicodeIndex(),
]


@pytest.mark.parametrize("df", dfs)
def test_dumps_serialize_numpy(df):
Example #22
0
 def test_select_dtypes_typecodes(self):
     """Selecting by every NumPy "AllFloat" typecode returns the whole frame."""
     # GH 11990
     frame = tm.makeCustomDataframe(
         30, 3, data_gen_f=lambda x, y: np.random.random())
     float_codes = list(np.typecodes["AllFloat"])
     selected = frame.select_dtypes(float_codes)
     tm.assert_frame_equal(selected, frame)
Example #23
0
    pd.DataFrame({'x': [1., 2., 3.]}),
    pd.DataFrame({0: [1, 2, 3]}),
    pd.DataFrame({'x': [1., 2., 3.], 'y': [4., 5., 6.]}),
    pd.DataFrame({'x': [1., 2., 3.]}, index=pd.Index([4, 5, 6], name='bar')),
    pd.Series([1., 2., 3.]),
    pd.Series([1., 2., 3.], name='foo'),
    pd.Series([1., 2., 3.], name='foo',
              index=[4, 5, 6]),
    pd.Series([1., 2., 3.], name='foo',
              index=pd.Index([4, 5, 6], name='bar')),
    pd.DataFrame({'x': ['a', 'b', 'c']}),
    pd.DataFrame({'x': [b'a', b'b', b'c']}),
    pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=True)}),
    pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=False)}),
    tm.makeCategoricalIndex(),
    tm.makeCustomDataframe(5, 3),
    tm.makeDataFrame(),
    tm.makeDateIndex(),
    tm.makeMissingDataframe(),
    tm.makeMixedDataFrame(),
    tm.makeObjectSeries(),
    tm.makePeriodFrame(),
    tm.makeRangeIndex(),
    tm.makeTimeDataFrame(),
    tm.makeTimeSeries(),
    tm.makeUnicodeIndex(),
]


@pytest.mark.parametrize('df', dfs)
def test_dumps_serialize_numpy(df):
Example #24
0
    def test_to_csv_moar(self):
        """Round-trip many frame shapes through ``to_csv`` / ``read_csv``.

        Frames built with ``tm.makeCustomDataframe`` are written with a fixed
        ``chunksize`` and read back via ``self.read_csv``.  Row counts are
        taken around ``N``, ``2 * N`` and ``chunksize // ncols``; the frames
        vary in column count, index/column label type, duplicate labels and
        number of index/column levels.  Each reconstructed frame must compare
        equal to the frame that was written.
        """
        N = 100
        chunksize = 1000

        # Label-type code (as passed to makeCustomDataframe) -> numpy dtype
        # used when re-casting the labels of both frames for comparison.
        type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O")

        def _to_uni(x):
            # Decode anything that is not already ``str``.
            if not isinstance(x, str):
                return x.decode("utf8")
            return x

        def _normalize_labels(recons_labels, df_labels, kind):
            """Return ``(recons_labels, df_labels)`` as comparable arrays.

            ``kind`` is the label-type code the frame was created with.
            """
            if kind == "u":  # unicode
                return (
                    np.array([_to_uni(label) for label in recons_labels],
                             dtype="O"),
                    np.array([_to_uni(label) for label in df_labels],
                             dtype="O"),
                )
            if kind == "dt":  # datetime
                return (
                    np.array([Timestamp(label) for label in recons_labels],
                             dtype="O"),
                    np.array([Timestamp(label) for label in df_labels],
                             dtype="O"),
                )
            if kind == "p":  # period
                # Both sides are converted to Timestamp objects: the labels
                # read back go through to_datetime, the original labels
                # through to_timestamp().
                return (
                    np.array(
                        [Timestamp(label)
                         for label in to_datetime(recons_labels)],
                        dtype="O"),
                    np.array(
                        [Timestamp(label)
                         for label in df_labels.to_timestamp()],
                        dtype="O"),
                )
            dtype = type_map.get(kind)
            return (np.array(recons_labels, dtype=dtype),
                    np.array(df_labels, dtype=dtype))

        def _do_test(df,
                     r_dtype=None,
                     c_dtype=None,
                     rnlvl=None,
                     cnlvl=None,
                     dupe_col=False):
            """Write ``df`` to CSV, read it back and assert equality.

            ``r_dtype`` / ``c_dtype`` are the label-type codes of the index
            and columns, ``rnlvl`` / ``cnlvl`` the number of index / column
            levels, and ``dupe_col`` marks frames with duplicate column
            labels.
            """
            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs["index_col"] = list(range(rnlvl))
                kwargs["header"] = list(range(cnlvl))
            else:
                kwargs["header"] = 0

            with tm.ensure_clean("__tmp_to_csv_moar__") as path:
                df.to_csv(path, encoding="utf8", chunksize=chunksize)
                recons = self.read_csv(path, **kwargs)

            if dupe_col:
                # read_csv disambiguates duplicate columns by labeling them
                # dupe.1, dupe.2, etc.; monkey patch the original labels back.
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # Rebuild the row MultiIndex from the parsed index plus the
                # leading ``rnlvl - 1`` data columns, then drop those columns.
                delta_lvl = [
                    recons.iloc[:, i].values for i in range(rnlvl - 1)
                ]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            if r_dtype:
                recons.index, df.index = _normalize_labels(
                    recons.index, df.index, r_dtype)
            if c_dtype:
                recons.columns, df.columns = _normalize_labels(
                    recons.columns, df.columns, c_dtype)

            tm.assert_frame_equal(df,
                                  recons,
                                  check_names=False,
                                  check_less_precise=True)

        def _row_counts(leading, ncols):
            """Return ``leading`` plus row counts around N, 2N and the base.

            The base is ``chunksize // ncols`` (at least 1).
            """
            base = chunksize // ncols or 1
            return leading + [
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]

        # Datetime index with string columns.  (This loop used to appear
        # twice verbatim; one pass is sufficient.)
        ncols = 4
        for nrows in _row_counts([2, 10], ncols):
            _do_test(
                tm.makeCustomDataframe(nrows,
                                       ncols,
                                       r_idx_type="dt",
                                       c_idx_type="s"),
                "dt",
                "s",
            )

        # Various index / column label-type combinations.
        for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"),
                                       ("p", "p")]:
            for ncols in [1, 2, 3, 4]:
                for nrows in _row_counts([2, 10], ncols):
                    _do_test(
                        tm.makeCustomDataframe(nrows,
                                               ncols,
                                               r_idx_type=r_idx_type,
                                               c_idx_type=c_idx_type),
                        r_idx_type,
                        c_idx_type,
                    )

        # Default label types.
        for ncols in [1, 2, 3, 4]:
            for nrows in _row_counts([10, N - 2], ncols):
                _do_test(tm.makeCustomDataframe(nrows, ncols))

        # Duplicate labels on both axes.
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = tm.makeCustomDataframe(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        # Frame with an index but no columns.
        _do_test(DataFrame(index=np.arange(10)))

        # Multi-level rows and/or columns.
        _do_test(tm.makeCustomDataframe(chunksize // 2 + 1, 2,
                                        r_idx_nlevels=2),
                 rnlvl=2)
        for ncols in [2, 3, 4]:
            for nrows in _row_counts([10, N - 2], ncols):
                _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2),
                         rnlvl=2)
                _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2),
                         cnlvl=2)
                _do_test(
                    tm.makeCustomDataframe(nrows,
                                           ncols,
                                           r_idx_nlevels=2,
                                           c_idx_nlevels=2),
                    rnlvl=2,
                    cnlvl=2,
                )
Example #25
0
def mkdf(rows, cols, colnames, **kwargs):
    """Create a custom test frame and relabel its columns.

    ``rows``, ``cols`` and ``kwargs`` are forwarded to
    ``pdtest.makeCustomDataframe``; ``colnames`` is a whitespace-separated
    string whose tokens become the column labels.
    """
    frame = pdtest.makeCustomDataframe(rows, cols, **kwargs)
    labels = colnames.split()
    frame.columns = labels
    return frame