Example #1
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=float)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
def create_data():
    """ create the pickle/msgpack data """

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))
    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  per=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)),
                 int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
                 mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])),
                 mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)),
                              index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                                                                       ['one', 'two', 'one', 'two', 'three']])),
                                                           names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
                 cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
                                              B=np.arange(3).astype(np.int64))),
                 mixed_dup=mixed_dup_df)

    mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
    mixed_dup_panel.items = ['ItemA', 'ItemA']
    panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
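The sparse helpers referenced above (_create_sp_series, _create_sp_tsseries, _create_sp_frame) are not shown here. A plausible sketch of what they might look like, modeled on pandas' legacy storage-file generator, follows; the names, values, and details are assumptions and may differ from the real helpers, and SparseSeries/SparseDataFrame/bdate_range only exist in older pandas (< 1.0).

def _create_sp_series():
    # Assumed helper: a block-sparse Series with a run of NaNs.
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = np.nan
    arr[-1:] = np.nan
    bseries = SparseSeries(arr, kind='block')
    bseries.name = 'bseries'
    return bseries


def _create_sp_tsseries():
    # Assumed helper: the same sparse data on a business-day DatetimeIndex.
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = np.nan
    arr[-1:] = np.nan
    date_index = bdate_range('1/1/2011', periods=len(arr))
    bseries = SparseSeries(arr, index=date_index, kind='block')
    bseries.name = 'btsseries'
    return bseries


def _create_sp_frame():
    # Assumed helper: a SparseDataFrame with NaN gaps in several columns.
    data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
            'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
            'C': np.arange(10).astype(np.int64),
            'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]}
    dates = bdate_range('1/1/2011', periods=10)
    return SparseDataFrame(data, index=dates)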
Example #3
    def test_mangles_multi_index(self):
        # See GH 18062
        data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
        df = self.read_csv(StringIO(data), header=[0, 1])
        expected = DataFrame([[0, 40, 34, 0.1]],
                             columns=MultiIndex.from_tuples(
                                 [('A', 'one'), ('A', 'one.1'),
                                  ('A', 'one.2'), ('B', 'two')]))
        tm.assert_frame_equal(df, expected)

        data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
        df = self.read_csv(StringIO(data), header=[0, 1])
        expected = DataFrame([[0, 40, 34, 0.1]],
                             columns=MultiIndex.from_tuples(
                                 [('A', 'one'), ('A', 'one.1'),
                                  ('A', 'one.1.1'), ('B', 'two')]))
        tm.assert_frame_equal(df, expected)

        data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
        df = self.read_csv(StringIO(data), header=[0, 1])
        expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
                             columns=MultiIndex.from_tuples(
                                 [('A', 'one'), ('A', 'one.1'),
                                  ('A', 'one.1.1'), ('B', 'two'),
                                  ('B', 'two.1')]))
        tm.assert_frame_equal(df, expected)
Example #4
    def test_loc_getitem_int_slice(self):
        # GH 3053
        # loc should treat integer slices like label slices

        index = MultiIndex.from_tuples([t for t in itertools.product(
            [6, 7, 8], ['a', 'b'])])
        df = DataFrame(np.random.randn(6, 6), index, index)
        result = df.loc[6:8, :]
        expected = df
        tm.assert_frame_equal(result, expected)

        index = MultiIndex.from_tuples([t
                                        for t in itertools.product(
                                            [10, 20, 30], ['a', 'b'])])
        df = DataFrame(np.random.randn(6, 6), index, index)
        result = df.loc[20:30, :]
        expected = df.iloc[2:]
        tm.assert_frame_equal(result, expected)

        # doc examples
        result = df.loc[10, :]
        expected = df.iloc[0:2]
        expected.index = ['a', 'b']
        tm.assert_frame_equal(result, expected)

        result = df.loc[:, 10]
        expected = df[10]
        tm.assert_frame_equal(result, expected)
def create_data():
    """ create the pickle/msgpack data """

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        scalars['period'] = Period('2012','M')

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))
    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        series['period'] = Series([Period('2000Q1')] * 5)
Example #6
def test_get_info_after_update(chunkstore_lib):
    df = DataFrame(data={'data': [1.1, 2.1, 3.1]},
                   index=MultiIndex.from_tuples([(dt(2016, 1, 1), 1),
                                                 (dt(2016, 1, 2), 1),
                                                 (dt(2016, 1, 3), 1)],
                                                names=['date', 'id'])
                   )
    chunkstore_lib.write('test_df', df, 'D')
    df2 = DataFrame(data={'data': [1.1, 1.1, 1.1]},
                    index=MultiIndex.from_tuples([(dt(2016, 1, 1), 2),
                                                  (dt(2016, 1, 2), 2),
                                                  (dt(2016, 1, 4), 1)],
                                                 names=['date', 'id'])
                    )
    chunkstore_lib.update('test_df', df2)
    assert_frame_equal(chunkstore_lib.read('test_df'), pd.concat([df, df2]).sort())

    info = {'rows': 6,
            'dtype': [('date', '<M8[ns]'), ('id', '<i8'), ('data', '<f8')],
            'chunk_count': 4,
            'col_names': {u'index': [u'date', u'id'], u'index_tz': [None, None], u'columns': [u'data']},
            'type': u'df',
            'size': 144}

    assert(chunkstore_lib.get_info('test_df') == info)
Example #7
    def test_sort_index_multiindex(self, level):
        # GH13496

        # sort rows by specified level of multi-index
        mi = MultiIndex.from_tuples([
            [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)

        expected_mi = MultiIndex.from_tuples([
            [1, 1, 1],
            [2, 1, 2],
            [2, 1, 3]], names=list('ABC'))
        expected = pd.DataFrame([
            [5, 6],
            [3, 4],
            [1, 2]], index=expected_mi)
        result = df.sort_index(level=level)
        assert_frame_equal(result, expected)

        # sort_remaining=False
        expected_mi = MultiIndex.from_tuples([
            [1, 1, 1],
            [2, 1, 3],
            [2, 1, 2]], names=list('ABC'))
        expected = pd.DataFrame([
            [5, 6],
            [1, 2],
            [3, 4]], index=expected_mi)
        result = df.sort_index(level=level, sort_remaining=False)
        assert_frame_equal(result, expected)
Example #8
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)
Example #9
def TreeMatrix(D, desc, L, Env=None, DshapeLarge=True):
    """
    Apply tree information (desc, L) to a given count matrix (D) and column
    grouping (Env) to obtain a matrix of counts over the tree.
    """
    # Assumes D has the correct column names.
    if not DshapeLarge:
        Z = (D[["Sample", "Taxon"]]).values
        Z = MultiIndex.from_tuples(map(tuple, tuple(Z)), names=["Sample", "Taxon"])
        D.index = Z
        Dlarge = D.Count.unstack(level=0)
        Dlarge.fillna(value=0, inplace=True)
        # Assumes Env has the correct index and column names.
        ExperimentalDesignColumns = MultiIndex.from_tuples(
            map(tuple, tuple(Env.ix[Dlarge.columns].values)),
            names=["Sample", "Group"])
    else:
        # If D is already in wide form, the grouping information is already
        # included in its columns.
        Dlarge = D
        ExperimentalDesignColumns = Dlarge.columns
    # If a taxon is present in the tree but not in the table, the .ix accessor
    # reports NA for that row, which is later converted to zero.
    NodeTableLarge = [[x[0], Dlarge.ix[x[-1]].sum()] for x in desc]
    Dtree = DataFrame.from_items(NodeTableLarge).transpose()
    NodeAndLeafNamesIndex = MultiIndex.from_tuples(
        map(tuple, tuple(L.loc[:, ["Name", "Is_Leaf"]].ix[Dtree.index].values)),
        names=["Name", "Is_Leaf"])
    Dtree.index = NodeAndLeafNamesIndex
    Dtree.columns = ExperimentalDesignColumns
    Dtree.columns = Dtree.columns.reorder_levels(["Group", "Sample"])
    return Dtree
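For orientation, a minimal sketch of wide-form (DshapeLarge=True) inputs is shown below; all names are made up for illustration, and because the function relies on the legacy .ix accessor and DataFrame.from_items it only runs against older pandas versions.

from pandas import DataFrame, MultiIndex

# Hypothetical wide count matrix: rows are taxa, columns already carry the
# (Sample, Group) pairing, as TreeMatrix expects when DshapeLarge=True.
cols = MultiIndex.from_tuples([("s1", "ctl"), ("s2", "trt")],
                              names=["Sample", "Group"])
D = DataFrame([[5, 1], [0, 3], [2, 0]], index=["t1", "t2", "t3"], columns=cols)

# Hypothetical tree description: each entry is [node_id, descendant_taxa],
# and L maps node ids to a display name and a leaf flag.
desc = [["n1", ["t1", "t2"]], ["n2", ["t3"]]]
L = DataFrame({"Name": ["inner", "leaf3"], "Is_Leaf": [False, True]},
              index=["n1", "n2"])

Dtree = TreeMatrix(D, desc, L)  # counts summed over each tree node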
Example #10
    def test_na_value_dict(self):
        data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""

        df = read_csv(StringIO(data),
                      na_values={'A': ['foo'], 'B': ['bar']})
        expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                              'B': [np.nan, 'foo', np.nan, 'foo'],
                              'C': [np.nan, 'foo', np.nan, 'foo']})
        assert_frame_equal(df, expected)

        data = """\
a,b,c,d
0,NA,1,5
"""
        xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
        xp.index.name = 'a'
        df = read_csv(StringIO(data), na_values={}, index_col=0)
        assert_frame_equal(df, xp)

        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        df = read_csv(StringIO(data), na_values={}, index_col=[0, 2])
        assert_frame_equal(df, xp)

        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        df = read_csv(StringIO(data), na_values={}, index_col=['a', 'c'])
        assert_frame_equal(df, xp)
Example #11
def test_groupby_as_index_apply(df):
    # GH #4648 and #3417
    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'user_id': [1, 2, 1, 1, 3, 1],
                    'time': range(6)})

    g_as = df.groupby('user_id', as_index=True)
    g_not_as = df.groupby('user_id', as_index=False)

    res_as = g_as.head(2).index
    res_not_as = g_not_as.head(2).index
    exp = Index([0, 1, 2, 4])
    tm.assert_index_equal(res_as, exp)
    tm.assert_index_equal(res_not_as, exp)

    res_as_apply = g_as.apply(lambda x: x.head(2)).index
    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
        2, 4)])
    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
    exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])

    tm.assert_index_equal(res_as_apply, exp_as_apply)
    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

    ind = Index(list('abcde'))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    res = df.groupby(0, as_index=False).apply(lambda x: x).index
    tm.assert_index_equal(res, ind)
def create_data():
    """ create the pickle data """

    import numpy as np
    import pandas
    from pandas import (Series,TimeSeries,DataFrame,Panel,
                        SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
                        Index,MultiIndex,PeriodIndex,
                        date_range,period_range,bdate_range,Timestamp)
    nan = np.nan

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E' : [0., 1, Timestamp('20100101'),'foo',2.],
        }

    index = dict(int = Index(np.arange(10)),
                 date = date_range('20130101',periods=10),
                 period = period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2 = MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                                 names=['first', 'second']))
    series = dict(float = Series(data['A']),
                  int = Series(data['B']),
                  mixed = Series(data['E']),
                  ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)),
                  mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
                                                                                                    [3,4,3,4,5]])),
                                                                                           names=['one','two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))

    frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
                 int = DataFrame(dict(A = series['int']  , B = series['int']   + 1)),
                 mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])),
                 mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)),
                                index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
                                                                       ['one','two','one','two','three']])),
                                                             names=['first','second'])),
                 dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                                 columns=['A', 'B', 'A']))
    panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
                 dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                             items=['A', 'B', 'A']))



    return dict( series = series,
                 frame = frame,
                 panel = panel,
                 index = index,
                 mi = mi,
                 sp_series = dict(float = _create_sp_series(),
                                  ts = _create_sp_tsseries()),
                 sp_frame = dict(float = _create_sp_frame())
                 )
Example #13
def test_getitem_bool_index_all(ind1, ind2):
    # GH#22533
    idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3),
                                  (40, 4), (50, 5)])
    tm.assert_index_equal(idx[ind1], idx)

    expected = MultiIndex.from_tuples([(10, 1), (30, 3)])
    tm.assert_index_equal(idx[ind2], expected)
Example #14
def test_drop(idx):
    dropped = idx.drop([('foo', 'two'), ('qux', 'one')])

    index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')])
    dropped2 = idx.drop(index)

    expected = idx[[0, 2, 3, 5]]
    tm.assert_index_equal(dropped, expected)
    tm.assert_index_equal(dropped2, expected)

    dropped = idx.drop(['bar'])
    expected = idx[[0, 1, 3, 4, 5]]
    tm.assert_index_equal(dropped, expected)

    dropped = idx.drop('foo')
    expected = idx[[2, 3, 4, 5]]
    tm.assert_index_equal(dropped, expected)

    index = MultiIndex.from_tuples([('bar', 'two')])
    with pytest.raises(KeyError, match=r"^10$"):
        idx.drop([('bar', 'two')])
    with pytest.raises(KeyError, match=r"^10$"):
        idx.drop(index)
    with pytest.raises(KeyError, match=r"^'two'$"):
        idx.drop(['foo', 'two'])

    # partially correct argument
    mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')])
    with pytest.raises(KeyError, match=r"^10$"):
        idx.drop(mixed_index)

    # error='ignore'
    dropped = idx.drop(index, errors='ignore')
    expected = idx[[0, 1, 2, 3, 4, 5]]
    tm.assert_index_equal(dropped, expected)

    dropped = idx.drop(mixed_index, errors='ignore')
    expected = idx[[0, 1, 2, 3, 5]]
    tm.assert_index_equal(dropped, expected)

    dropped = idx.drop(['foo', 'two'], errors='ignore')
    expected = idx[[2, 3, 4, 5]]
    tm.assert_index_equal(dropped, expected)

    # mixed partial / full drop
    dropped = idx.drop(['foo', ('qux', 'one')])
    expected = idx[[2, 3, 5]]
    tm.assert_index_equal(dropped, expected)

    # mixed partial / full drop / error='ignore'
    mixed_index = ['foo', ('qux', 'one'), 'two']
    with pytest.raises(KeyError, match=r"^'two'$"):
        idx.drop(mixed_index)
    dropped = idx.drop(mixed_index, errors='ignore')
    expected = idx[[2, 3, 5]]
    tm.assert_index_equal(dropped, expected)
Example #15
    def test_stack_datetime_column_multiIndex(self):
        # GH 8039
        t = datetime(2014, 1, 1)
        df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
        result = df.stack()

        eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)])
        ecols = MultiIndex.from_tuples([(t, "A")])
        expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
        assert_frame_equal(result, expected)
Example #16
def test_boolean_context_compat2():

    # boolean context compat
    # GH7897
    i1 = MultiIndex.from_tuples([('A', 1), ('A', 2)])
    i2 = MultiIndex.from_tuples([('A', 1), ('A', 3)])
    common = i1.intersection(i2)

    with pytest.raises(ValueError):
        bool(common)
Example #17
def test_indexing_ambiguity_bug_1678():
    # GH 1678
    columns = MultiIndex.from_tuples(
        [('Ohio', 'Green'), ('Ohio', 'Red'), ('Colorado', 'Green')])
    index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

    df = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns)

    result = df.iloc[:, 1]
    expected = df.loc[:, ('Ohio', 'Red')]
    tm.assert_series_equal(result, expected)
Example #18
    def test_multiindex_get(self):
        ind = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["first", "second"])
        wp = Panel(np.random.random((4, 5, 5)), items=ind, major_axis=np.arange(5), minor_axis=np.arange(5))
        f1 = wp["a"]
        f2 = wp.ix["a"]
        assert_panel_equal(f1, f2)

        self.assert_((f1.items == [1, 2]).all())
        self.assert_((f2.items == [1, 2]).all())

        ind = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)], names=["first", "second"])
Example #19
def test_from_tuples():
    msg = 'Cannot infer number of levels from empty list'
    with pytest.raises(TypeError, match=msg):
        MultiIndex.from_tuples([])

    expected = MultiIndex(levels=[[1, 3], [2, 4]],
                          codes=[[0, 1], [0, 1]],
                          names=['a', 'b'])

    # input tuples
    result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b'])
    tm.assert_index_equal(result, expected)
Example #20
    def test_sort_index_multiindex(self):
        # GH13496

        # sort rows by specified level of multi-index
        mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)

        # MI sort, but no level: sort_level has no effect
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)
        result = df.sort_index(sort_remaining=False)
        expected = df.sort_index()
        assert_frame_equal(result, expected)
Example #21
    def test_indexing_ambiguity_bug_1678(self):
        columns = MultiIndex.from_tuples([('Ohio', 'Green'), ('Ohio', 'Red'), (
            'Colorado', 'Green')])
        index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)
                                        ])

        frame = DataFrame(np.arange(12).reshape((4, 3)), index=index,
                          columns=columns)

        result = frame.iloc[:, 1]
        exp = frame.loc[:, ('Ohio', 'Red')]
        assert isinstance(result, Series)
        tm.assert_series_equal(result, exp)
Example #22
    def test_subclass_stack_multi_mixed(self):
        # GH 15564
        df = tm.SubclassedDataFrame([
            [10, 11, 12.0, 13.0],
            [20, 21, 22.0, 23.0],
            [30, 31, 32.0, 33.0],
            [40, 41, 42.0, 43.0]],
            index=MultiIndex.from_tuples(
                list(zip(list('AABB'), list('cdcd'))),
                names=['aaa', 'ccc']),
            columns=MultiIndex.from_tuples(
                list(zip(list('WWXX'), list('yzyz'))),
                names=['www', 'yyy']))

        exp = tm.SubclassedDataFrame([
            [10, 12.0],
            [11, 13.0],
            [20, 22.0],
            [21, 23.0],
            [30, 32.0],
            [31, 33.0],
            [40, 42.0],
            [41, 43.0]],
            index=MultiIndex.from_tuples(list(zip(
                list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
                names=['aaa', 'ccc', 'yyy']),
            columns=Index(['W', 'X'], name='www'))

        res = df.stack()
        tm.assert_frame_equal(res, exp)

        res = df.stack('yyy')
        tm.assert_frame_equal(res, exp)

        exp = tm.SubclassedDataFrame([
            [10.0, 11.0],
            [12.0, 13.0],
            [20.0, 21.0],
            [22.0, 23.0],
            [30.0, 31.0],
            [32.0, 33.0],
            [40.0, 41.0],
            [42.0, 43.0]],
            index=MultiIndex.from_tuples(list(zip(
                list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
                names=['aaa', 'ccc', 'www']),
            columns=Index(['y', 'z'], name='yyy'))

        res = df.stack('www')
        tm.assert_frame_equal(res, exp)
def test_index_equal_values_mismatch(check_exact):
    msg = """MultiIndex level \\[1\\] are different

MultiIndex level \\[1\\] values are different \\(25\\.0 %\\)
\\[left\\]:  Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""

    idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2),
                                   ("B", 3), ("B", 4)])
    idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2),
                                   ("B", 3), ("B", 4)])

    with pytest.raises(AssertionError, match=msg):
        assert_index_equal(idx1, idx2, check_exact=check_exact)
Example #24
def test_from_tuples_iterator():
    # GH 18434
    # input iterator for tuples
    expected = MultiIndex(levels=[[1, 3], [2, 4]],
                          labels=[[0, 1], [0, 1]],
                          names=['a', 'b'])

    result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b'])
    tm.assert_index_equal(result, expected)

    # input non-iterables
    with tm.assert_raises_regex(
            TypeError, 'Input must be a list / sequence of tuple-likes.'):
        MultiIndex.from_tuples(0)
Example #25
    def test_iloc_getitem_panel_multiindex(self):

        with catch_warnings(record=True):

            # GH 7199
            # Panel with multi-index
            multi_index = MultiIndex.from_tuples([('ONE', 'one'),
                                                  ('TWO', 'two'),
                                                  ('THREE', 'three')],
                                                 names=['UPPER', 'lower'])

            simple_index = [x[0] for x in multi_index]
            wd1 = Panel(items=['First', 'Second'],
                        major_axis=['a', 'b', 'c', 'd'],
                        minor_axis=multi_index)

            wd2 = Panel(items=['First', 'Second'],
                        major_axis=['a', 'b', 'c', 'd'],
                        minor_axis=simple_index)

            expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]]
            result1 = wd1.iloc[0, [True, True, True, False], [0, 2]]  # WRONG
            tm.assert_frame_equal(result1, expected1)

            expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]]
            result2 = wd2.iloc[0, [True, True, True, False], [0, 2]]
            tm.assert_frame_equal(result2, expected2)

            expected1 = DataFrame(index=['a'], columns=multi_index,
                                  dtype='float64')
            result1 = wd1.iloc[0, [0], [0, 1, 2]]
            tm.assert_frame_equal(result1, expected1)

            expected2 = DataFrame(index=['a'], columns=simple_index,
                                  dtype='float64')
            result2 = wd2.iloc[0, [0], [0, 1, 2]]
            tm.assert_frame_equal(result2, expected2)

            # GH 7516
            mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')])
            p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3),
                      items=['a', 'b', 'c'], major_axis=mi,
                      minor_axis=['u', 'v', 'w'])
            result = p.iloc[:, 1, 0]
            expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u')
            tm.assert_series_equal(result, expected)

            result = p.loc[:, (1, 'y'), 'u']
            tm.assert_series_equal(result, expected)
Example #26
    def test_dateparser_resolution_if_not_ns(self):
        # GH 10245
        data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

        def date_parser(date, time):
            datetime = np_array_datetime64_compat(
                date + 'T' + time + 'Z', dtype='datetime64[s]')
            return datetime

        df = self.read_csv(StringIO(data), date_parser=date_parser,
                           parse_dates={'datetime': ['date', 'time']},
                           index_col=['datetime', 'prn'])

        datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
                                               dtype='datetime64[s]')
        df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
                               index=MultiIndex.from_tuples(
                                   [(datetimes[0], 126),
                                    (datetimes[1], 23),
                                    (datetimes[2], 13)],
                               names=['datetime', 'prn']))
        tm.assert_frame_equal(df, df_correct)
Example #27
    def test_iloc_getitem_multiindex2(self):
        # TODO(wesm): fix this
        pytest.skip('this test was being suppressed, '
                    'needs to be fixed')

        arr = np.random.randn(3, 3)
        df = DataFrame(arr, columns=[[2, 2, 4], [6, 8, 10]],
                       index=[[4, 4, 8], [8, 10, 12]])

        rs = df.iloc[2]
        xp = Series(arr[2], index=df.columns)
        tm.assert_series_equal(rs, xp)

        rs = df.iloc[:, 2]
        xp = Series(arr[:, 2], index=df.index)
        tm.assert_series_equal(rs, xp)

        rs = df.iloc[2, 2]
        xp = df.values[2, 2]
        assert rs == xp

        # for multiple items
        # GH 5528
        rs = df.iloc[[0, 1]]
        xp = df.xs(4, drop_level=False)
        tm.assert_frame_equal(rs, xp)

        tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']])
        index = MultiIndex.from_tuples(tup)
        df = DataFrame(np.random.randn(4, 4), index=index)
        rs = df.iloc[[2, 3]]
        xp = df.xs('b', drop_level=False)
        tm.assert_frame_equal(rs, xp)
Example #28
    def test_constructor_dict_of_tuples(self):
        data = {(1, 2): 3,
                (None, 5): 6}
        result = Series(data).sort_values()
        expected = Series([3, 6],
                          index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
        tm.assert_series_equal(result, expected)
    def test_boxplot_legacy(self):
        grouped = self.hist_df.groupby(by='gender')
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            axes = _check_plot_works(grouped.boxplot, return_type='axes')
        self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2))

        axes = _check_plot_works(grouped.boxplot, subplots=False,
                                 return_type='axes')
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
        tuples = lzip(string.ascii_letters[:10], range(10))
        df = DataFrame(np.random.rand(10, 3),
                       index=MultiIndex.from_tuples(tuples))

        grouped = df.groupby(level=1)
        axes = _check_plot_works(grouped.boxplot, return_type='axes')
        self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3))

        axes = _check_plot_works(grouped.boxplot, subplots=False,
                                 return_type='axes')
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))

        grouped = df.unstack(level=1).groupby(level=0, axis=1)
        axes = _check_plot_works(grouped.boxplot, return_type='axes')
        self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2))

        axes = _check_plot_works(grouped.boxplot, subplots=False,
                                 return_type='axes')
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index='a', columns=['b', 'c'], values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
Example #31
def test_comprehensive(df_ext, environment):
    # test as many low level features simultaneously as possible
    cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")])
    ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")])
    df_ext.index, df_ext.columns = ridx, cidx
    stlr = df_ext.style
    stlr.set_caption("mycap")
    stlr.set_table_styles([
        {
            "selector": "label",
            "props": ":{fig§item}"
        },
        {
            "selector": "position",
            "props": ":h!"
        },
        {
            "selector": "position_float",
            "props": ":centering"
        },
        {
            "selector": "column_format",
            "props": ":rlrlr"
        },
        {
            "selector": "toprule",
            "props": ":toprule"
        },
        {
            "selector": "midrule",
            "props": ":midrule"
        },
        {
            "selector": "bottomrule",
            "props": ":bottomrule"
        },
        {
            "selector": "rowcolors",
            "props": ":{3}{pink}{}"
        },  # custom command
    ])
    stlr.highlight_max(axis=0,
                       props="textbf:--rwrap;cellcolor:[rgb]{1,1,0.6}--rwrap")
    stlr.highlight_max(axis=None,
                       props="Huge:--wrap;",
                       subset=[("Z", "a"), ("Z", "b")])

    expected = ("""\
\\begin{table}[h!]
\\centering
\\caption{mycap}
\\label{fig:item}
\\rowcolors{3}{pink}{}
\\begin{tabular}{rlrlr}
\\toprule
 &  & \\multicolumn{2}{r}{Z} & Y \\\\
 &  & a & b & c \\\\
\\midrule
\\multirow[c]{2}{*}{A} & a & 0 & \\textbf{\\cellcolor[rgb]{1,1,0.6}{-0.61}} & ab \\\\
 & b & 1 & -1.22 & cd \\\\
B & c & \\textbf{\\cellcolor[rgb]{1,1,0.6}{{\\Huge 2}}} & -2.22 & """
                """\
\\textbf{\\cellcolor[rgb]{1,1,0.6}{de}} \\\\
\\bottomrule
\\end{tabular}
\\end{table}
""").replace("table", environment if environment else "table")
    result = stlr.format(precision=2).to_latex(environment=environment)
    assert result == expected
Example #32
def test_sortlevel_not_sort_remaining():
    mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
    sorted_idx, _ = mi.sortlevel("A", sort_remaining=False)
    assert sorted_idx.equals(mi)
Example #33
    )
    result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
    expected = Series(
        [np.dtype("datetime64[ns]"), object, object, np.int64, object],
        index=["observation", "color", "mood", "intensity", "score"],
    )
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "index",
    [
        pd.CategoricalIndex(list("abc")),
        pd.interval_range(0, 3),
        pd.period_range("2020", periods=3, freq="D"),
        MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
    ],
)
def test_apply_index_has_complex_internals(index):
    # GH 31248
    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
    result = df.groupby("group").apply(lambda x: x)
    tm.assert_frame_equal(result, df)


@pytest.mark.parametrize(
    "function, expected_values",
    [
        (lambda x: x.index.to_list(), [[0, 1], [2, 3]]),
        (lambda x: set(x.index.to_list()), [{0, 1}, {2, 3}]),
        (lambda x: tuple(x.index.to_list()), [(0, 1), (2, 3)]),
Example #34
    def test_rename_mi(self):
        s = Series(
            [11, 21, 31],
            index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]),
        )
        s.rename(str.lower)
Example #35
def test_get_indexer_nearest():
    midx = MultiIndex.from_tuples([('a', 1), ('b', 2)])
    with pytest.raises(NotImplementedError):
        midx.get_indexer(['a'], method='nearest')
    with pytest.raises(NotImplementedError):
        midx.get_indexer(['a'], method='pad', tolerance=2)
Example #36
def test_where():
    i = MultiIndex.from_tuples([('A', 1), ('A', 2)])

    msg = r"\.where is not supported for MultiIndex operations"
    with pytest.raises(NotImplementedError, match=msg):
        i.where(True)
Example #37
def test_from_tuples_index_values(idx):
    result = MultiIndex.from_tuples(idx)
    assert (result.values == idx.values).all()
Example #38
def test_from_tuples_empty():
    # GH 16777
    result = MultiIndex.from_tuples([], names=["a", "b"])
    expected = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"])
    tm.assert_index_equal(result, expected)
Example #39
    def test_sort_index_nan_multiindex(self):
        # GH#14784
        # incorrect sorting w.r.t. nans
        tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
        mi = MultiIndex.from_tuples(tuples)

        df = DataFrame(np.arange(16).reshape(4, 4),
                       index=mi,
                       columns=list("ABCD"))
        s = Series(np.arange(4), index=mi)

        df2 = DataFrame({
            "date":
            pd.DatetimeIndex([
                "20121002",
                "20121007",
                "20130130",
                "20130202",
                "20130305",
                "20121002",
                "20121207",
                "20130130",
                "20130202",
                "20130305",
                "20130202",
                "20130305",
            ]),
            "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            "whole_cost": [
                1790,
                np.nan,
                280,
                259,
                np.nan,
                623,
                90,
                312,
                np.nan,
                301,
                359,
                801,
            ],
            "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
        }).set_index(["date", "user_id"])

        # sorting frame, default nan position is last
        result = df.sort_index()
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position last
        result = df.sort_index(na_position="last")
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position first
        result = df.sort_index(na_position="first")
        expected = df.iloc[[1, 2, 3, 0], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame with removed rows
        result = df2.dropna().sort_index()
        expected = df2.sort_index().dropna()
        tm.assert_frame_equal(result, expected)

        # sorting series, default nan position is last
        result = s.sort_index()
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position last
        result = s.sort_index(na_position="last")
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position first
        result = s.sort_index(na_position="first")
        expected = s.iloc[[1, 2, 3, 0]]
        tm.assert_series_equal(result, expected)
Example #40
class TestDataFrameSortIndex:
    def test_sort_index_and_reconstruction_doc_example(self):
        # doc example
        df = DataFrame(
            {"value": [1, 2, 3, 4]},
            index=MultiIndex(levels=[["a", "b"], ["bb", "aa"]],
                             codes=[[0, 0, 1, 1], [0, 1, 0, 1]]),
        )
        assert df.index._is_lexsorted()
        assert not df.index.is_monotonic

        # sort it
        expected = DataFrame(
            {"value": [2, 1, 4, 3]},
            index=MultiIndex(levels=[["a", "b"], ["aa", "bb"]],
                             codes=[[0, 0, 1, 1], [0, 1, 0, 1]]),
        )
        result = df.sort_index()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # reconstruct
        result = df.sort_index().copy()
        result.index = result.index._sort_levels_monotonic()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

    def test_sort_index_non_existent_label_multiindex(self):
        # GH#12261
        df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []]))
        df.loc["b", "2"] = 1
        df.loc["a", "3"] = 1
        result = df.sort_index().index.is_monotonic
        assert result is True

    def test_sort_index_reorder_on_ops(self):
        # GH#15687
        df = DataFrame(
            np.random.randn(8, 2),
            index=MultiIndex.from_product(
                [["a", "b"], ["big", "small"], ["red", "blu"]],
                names=["letter", "size", "color"],
            ),
            columns=["near", "far"],
        )
        df = df.sort_index()

        def my_func(group):
            group.index = ["newz", "newa"]
            return group

        result = df.groupby(
            level=["letter", "size"]).apply(my_func).sort_index()
        expected = MultiIndex.from_product(
            [["a", "b"], ["big", "small"], ["newa", "newz"]],
            names=["letter", "size", None],
        )

        tm.assert_index_equal(result.index, expected)

    def test_sort_index_nan_multiindex(self):
        # GH#14784
        # incorrect sorting w.r.t. nans
        tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
        mi = MultiIndex.from_tuples(tuples)

        df = DataFrame(np.arange(16).reshape(4, 4),
                       index=mi,
                       columns=list("ABCD"))
        s = Series(np.arange(4), index=mi)

        df2 = DataFrame({
            "date":
            pd.DatetimeIndex([
                "20121002",
                "20121007",
                "20130130",
                "20130202",
                "20130305",
                "20121002",
                "20121207",
                "20130130",
                "20130202",
                "20130305",
                "20130202",
                "20130305",
            ]),
            "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            "whole_cost": [
                1790,
                np.nan,
                280,
                259,
                np.nan,
                623,
                90,
                312,
                np.nan,
                301,
                359,
                801,
            ],
            "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
        }).set_index(["date", "user_id"])

        # sorting frame, default nan position is last
        result = df.sort_index()
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position last
        result = df.sort_index(na_position="last")
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position first
        result = df.sort_index(na_position="first")
        expected = df.iloc[[1, 2, 3, 0], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame with removed rows
        result = df2.dropna().sort_index()
        expected = df2.sort_index().dropna()
        tm.assert_frame_equal(result, expected)

        # sorting series, default nan position is last
        result = s.sort_index()
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position last
        result = s.sort_index(na_position="last")
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position first
        result = s.sort_index(na_position="first")
        expected = s.iloc[[1, 2, 3, 0]]
        tm.assert_series_equal(result, expected)

    def test_sort_index_nan(self):
        # GH#3917

        # Test DataFrame with nan label
        df = DataFrame(
            {
                "A": [1, 2, np.nan, 1, 6, 8, 4],
                "B": [9, np.nan, 5, 2, 5, 4, 5]
            },
            index=[1, 2, 3, 4, 5, 6, np.nan],
        )

        # NaN label, ascending=True, na_position='last'
        sorted_df = df.sort_index(kind="quicksort",
                                  ascending=True,
                                  na_position="last")
        expected = DataFrame(
            {
                "A": [1, 2, np.nan, 1, 6, 8, 4],
                "B": [9, np.nan, 5, 2, 5, 4, 5]
            },
            index=[1, 2, 3, 4, 5, 6, np.nan],
        )
        tm.assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=True, na_position='first'
        sorted_df = df.sort_index(na_position="first")
        expected = DataFrame(
            {
                "A": [4, 1, 2, np.nan, 1, 6, 8],
                "B": [5, 9, np.nan, 5, 2, 5, 4]
            },
            index=[np.nan, 1, 2, 3, 4, 5, 6],
        )
        tm.assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='last'
        sorted_df = df.sort_index(kind="quicksort", ascending=False)
        expected = DataFrame(
            {
                "A": [8, 6, 1, np.nan, 2, 1, 4],
                "B": [4, 5, 2, 5, np.nan, 9, 5]
            },
            index=[6, 5, 4, 3, 2, 1, np.nan],
        )
        tm.assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='first'
        sorted_df = df.sort_index(kind="quicksort",
                                  ascending=False,
                                  na_position="first")
        expected = DataFrame(
            {
                "A": [4, 8, 6, 1, np.nan, 2, 1],
                "B": [5, 4, 5, 2, 5, np.nan, 9]
            },
            index=[np.nan, 6, 5, 4, 3, 2, 1],
        )
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_index_multi_index(self):
        # GH#25775, testing that sorting by index works with a multi-index.
        df = DataFrame({
            "a": [3, 1, 2],
            "b": [0, 0, 0],
            "c": [0, 1, 2],
            "d": list("abc")
        })
        result = df.set_index(list("abc")).sort_index(level=list("ba"))

        expected = DataFrame({
            "a": [1, 2, 3],
            "b": [0, 0, 0],
            "c": [1, 2, 0],
            "d": list("bca")
        })
        expected = expected.set_index(list("abc"))

        tm.assert_frame_equal(result, expected)

    def test_sort_index_inplace(self):
        frame = DataFrame(np.random.randn(4, 4),
                          index=[1, 2, 3, 4],
                          columns=["A", "B", "C", "D"])

        # axis=0
        unordered = frame.loc[[3, 2, 4, 1]]
        a_id = id(unordered["A"])
        df = unordered.copy()
        return_value = df.sort_index(inplace=True)
        assert return_value is None
        expected = frame
        tm.assert_frame_equal(df, expected)
        assert a_id != id(df["A"])

        df = unordered.copy()
        return_value = df.sort_index(ascending=False, inplace=True)
        assert return_value is None
        expected = frame[::-1]
        tm.assert_frame_equal(df, expected)

        # axis=1
        unordered = frame.loc[:, ["D", "B", "C", "A"]]
        df = unordered.copy()
        return_value = df.sort_index(axis=1, inplace=True)
        assert return_value is None
        expected = frame
        tm.assert_frame_equal(df, expected)

        df = unordered.copy()
        return_value = df.sort_index(axis=1, ascending=False, inplace=True)
        assert return_value is None
        expected = frame.iloc[:, ::-1]
        tm.assert_frame_equal(df, expected)

    def test_sort_index_different_sortorder(self):
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)

        # test with multiindex, too
        idf = df.set_index(["A", "B"])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        tm.assert_frame_equal(result, expected)

        # also, Series!
        result = idf["C"].sort_index(ascending=[1, 0])
        tm.assert_series_equal(result, expected["C"])

    def test_sort_index_level(self):
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
        df = DataFrame([[1, 2], [3, 4]], mi)

        result = df.sort_index(level="A", sort_remaining=False)
        expected = df
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(level=["A", "B"], sort_remaining=False)
        expected = df
        tm.assert_frame_equal(result, expected)

        # Error thrown by sort_index when
        # first index is sorted last (GH#26053)
        result = df.sort_index(level=["C", "B", "A"])
        expected = df.iloc[[1, 0]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(level=["B", "C", "A"])
        expected = df.iloc[[1, 0]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(level=["C", "A"])
        expected = df.iloc[[1, 0]]
        tm.assert_frame_equal(result, expected)

    def test_sort_index_categorical_index(self):

        df = DataFrame({
            "A":
            np.arange(6, dtype="int64"),
            "B":
            Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
        }).set_index("B")

        result = df.sort_index()
        expected = df.iloc[[4, 0, 1, 5, 2, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(ascending=False)
        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
        tm.assert_frame_equal(result, expected)

    def test_sort_index(self):
        # GH#13496

        frame = DataFrame(
            np.arange(16).reshape(4, 4),
            index=[1, 2, 3, 4],
            columns=["A", "B", "C", "D"],
        )

        # axis=0 : sort rows by index labels
        unordered = frame.loc[[3, 2, 4, 1]]
        result = unordered.sort_index(axis=0)
        expected = frame
        tm.assert_frame_equal(result, expected)

        result = unordered.sort_index(ascending=False)
        expected = frame[::-1]
        tm.assert_frame_equal(result, expected)

        # axis=1 : sort columns by column names
        unordered = frame.iloc[:, [2, 1, 3, 0]]
        result = unordered.sort_index(axis=1)
        tm.assert_frame_equal(result, frame)

        result = unordered.sort_index(axis=1, ascending=False)
        expected = frame.iloc[:, ::-1]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("level", ["A", 0])  # GH#21052
    def test_sort_index_multiindex(self, level):
        # GH#13496

        # sort rows by specified level of multi-index
        mi = MultiIndex.from_tuples([[2, 1, 3], [2, 1, 2], [1, 1, 1]],
                                    names=list("ABC"))
        df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)

        expected_mi = MultiIndex.from_tuples([[1, 1, 1], [2, 1, 2], [2, 1, 3]],
                                             names=list("ABC"))
        expected = DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi)
        result = df.sort_index(level=level)
        tm.assert_frame_equal(result, expected)

        # sort_remaining=False
        expected_mi = MultiIndex.from_tuples([[1, 1, 1], [2, 1, 3], [2, 1, 2]],
                                             names=list("ABC"))
        expected = DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi)
        result = df.sort_index(level=level, sort_remaining=False)
        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) groupby
    def test_sort_index_intervalindex(self):
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        y = Series(np.random.randn(100))
        x1 = Series(np.sign(np.random.randn(100)))
        x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"])

        result = model.groupby(["X1", "X2"], observed=True).mean().unstack()
        expected = IntervalIndex.from_tuples([(-3.0, -0.5), (-0.5, 0.0),
                                              (0.0, 0.5), (0.5, 3.0)],
                                             closed="right")
        result = result.columns.levels[1].categories
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize(
        "original_dict, sorted_dict, ascending, ignore_index, output_index",
        [
            ({
                "A": [1, 2, 3]
            }, {
                "A": [2, 3, 1]
            }, False, True, [0, 1, 2]),
            ({
                "A": [1, 2, 3]
            }, {
                "A": [1, 3, 2]
            }, True, True, [0, 1, 2]),
            ({
                "A": [1, 2, 3]
            }, {
                "A": [2, 3, 1]
            }, False, False, [5, 3, 2]),
            ({
                "A": [1, 2, 3]
            }, {
                "A": [1, 3, 2]
            }, True, False, [2, 3, 5]),
        ],
    )
    def test_sort_index_ignore_index(self, inplace, original_dict, sorted_dict,
                                     ascending, ignore_index, output_index):
        # GH 30114
        original_index = [2, 5, 3]
        df = DataFrame(original_dict, index=original_index)
        expected_df = DataFrame(sorted_dict, index=output_index)
        kwargs = {
            "ascending": ascending,
            "ignore_index": ignore_index,
            "inplace": inplace,
        }

        if inplace:
            result_df = df.copy()
            result_df.sort_index(**kwargs)
        else:
            result_df = df.sort_index(**kwargs)

        tm.assert_frame_equal(result_df, expected_df)
        tm.assert_frame_equal(df, DataFrame(original_dict,
                                            index=original_index))
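
A quick illustrative sketch of the ignore_index keyword (available since pandas 1.0):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]}, index=[2, 5, 3])
print(df.sort_index())                   # keeps the original labels 2, 3, 5
print(df.sort_index(ignore_index=True))  # relabels the result 0, 1, 2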

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize(
        "original_dict, sorted_dict, ascending, ignore_index, output_index",
        [
            (
                {
                    "M1": [1, 2],
                    "M2": [3, 4]
                },
                {
                    "M1": [1, 2],
                    "M2": [3, 4]
                },
                True,
                True,
                [0, 1],
            ),
            (
                {
                    "M1": [1, 2],
                    "M2": [3, 4]
                },
                {
                    "M1": [2, 1],
                    "M2": [4, 3]
                },
                False,
                True,
                [0, 1],
            ),
            (
                {
                    "M1": [1, 2],
                    "M2": [3, 4]
                },
                {
                    "M1": [1, 2],
                    "M2": [3, 4]
                },
                True,
                False,
                MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")),
            ),
            (
                {
                    "M1": [1, 2],
                    "M2": [3, 4]
                },
                {
                    "M1": [2, 1],
                    "M2": [4, 3]
                },
                False,
                False,
                MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")),
            ),
        ],
    )
    def test_sort_index_ignore_index_multi_index(self, inplace, original_dict,
                                                 sorted_dict, ascending,
                                                 ignore_index, output_index):
        # GH 30114, this is to test ignore_index when the index is a MultiIndex
        mi = MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB"))
        df = DataFrame(original_dict, index=mi)
        expected_df = DataFrame(sorted_dict, index=output_index)

        kwargs = {
            "ascending": ascending,
            "ignore_index": ignore_index,
            "inplace": inplace,
        }

        if inplace:
            result_df = df.copy()
            result_df.sort_index(**kwargs)
        else:
            result_df = df.sort_index(**kwargs)

        tm.assert_frame_equal(result_df, expected_df)
        tm.assert_frame_equal(df, DataFrame(original_dict, index=mi))

    def test_sort_index_categorical_multiindex(self):
        # GH#15058
        df = DataFrame({
            "a":
            range(6),
            "l1":
            pd.Categorical(
                ["a", "a", "b", "b", "c", "c"],
                categories=["c", "a", "b"],
                ordered=True,
            ),
            "l2": [0, 1, 0, 1, 0, 1],
        })
        result = df.set_index(["l1", "l2"]).sort_index()
        expected = DataFrame(
            [4, 5, 0, 1, 2, 3],
            columns=["a"],
            index=MultiIndex(
                levels=[
                    CategoricalIndex(
                        ["c", "a", "b"],
                        categories=["c", "a", "b"],
                        ordered=True,
                        name="l1",
                        dtype="category",
                    ),
                    [0, 1],
                ],
                codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
                names=["l1", "l2"],
            ),
        )
        tm.assert_frame_equal(result, expected)
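
A small standalone sketch (illustrative data) of how an ordered Categorical in the index drives sort order:

import pandas as pd

cats = pd.Categorical(["a", "b", "c"], categories=["c", "a", "b"], ordered=True)
s = pd.Series([1, 2, 3], index=pd.CategoricalIndex(cats))
print(s.sort_index())  # rows come out in category order: c, a, b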

    def test_sort_index_and_reconstruction(self):

        # GH#15622
        # lexsortedness should be identical
        # across MultiIndex construction methods

        df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
        expected = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_tuples([(0.5, "a"), (0.5, "b"), (0.8, "a"),
                                          (0.8, "b")]),
        )
        assert expected.index._is_lexsorted()

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
        )
        result = result.sort_index()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex(levels=[[0.5, 0.8], ["a", "b"]],
                             codes=[[0, 0, 1, 1], [0, 1, 0, 1]]),
        )
        result = result.sort_index()
        assert result.index._is_lexsorted()

        tm.assert_frame_equal(result, expected)

        concatted = pd.concat([df, df], keys=[0.8, 0.5])
        result = concatted.sort_index()

        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # GH#14015
        df = DataFrame(
            [[1, 2], [6, 7]],
            columns=MultiIndex.from_tuples(
                [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
                names=["l1", "Date"],
            ),
        )

        df.columns = df.columns.set_levels(pd.to_datetime(
            df.columns.levels[1]),
                                           level=1)
        assert not df.columns.is_monotonic
        result = df.sort_index(axis=1)
        assert result.columns.is_monotonic
        result = df.sort_index(axis=1, level=1)
        assert result.columns.is_monotonic
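
For the GH#14015 part above, a condensed sketch (illustrative column labels) of sorting non-monotonic datetime column levels:

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], names=["l1", "Date"])
df = pd.DataFrame([[1, 2]], columns=cols)
df.columns = df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), level="Date")
print(df.columns.is_monotonic_increasing)                     # False
print(df.sort_index(axis=1).columns.is_monotonic_increasing)  # True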

    # TODO: better name, de-duplicate with test_sort_index_level above
    def test_sort_index_level2(self):
        mi = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        frame = DataFrame(
            np.random.randn(10, 3),
            index=mi,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        df = frame.copy()
        df.index = np.arange(len(df))

        # axis=1

        # series
        a_sorted = frame["A"].sort_index(level=0)

        # preserve names
        assert a_sorted.index.names == frame.index.names

        # inplace
        rs = frame.copy()
        return_value = rs.sort_index(level=0, inplace=True)
        assert return_value is None
        tm.assert_frame_equal(rs, frame.sort_index(level=0))

    def test_sort_index_level_large_cardinality(self):

        # GH#2684 (int64)
        index = MultiIndex.from_arrays([np.arange(4000)] * 3)
        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)

        # it works!
        result = df.sort_index(level=0)
        assert result.index._lexsort_depth == 3

        # GH#2684 (int32)
        index = MultiIndex.from_arrays([np.arange(4000)] * 3)
        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)

        # it works!
        result = df.sort_index(level=0)
        assert (result.dtypes.values == df.dtypes.values).all()
        assert result.index._lexsort_depth == 3

    def test_sort_index_level_by_name(self):
        mi = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        frame = DataFrame(
            np.random.randn(10, 3),
            index=mi,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        frame.index.names = ["first", "second"]
        result = frame.sort_index(level="second")
        expected = frame.sort_index(level=1)
        tm.assert_frame_equal(result, expected)

    def test_sort_index_level_mixed(self):
        mi = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        frame = DataFrame(
            np.random.randn(10, 3),
            index=mi,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        sorted_before = frame.sort_index(level=1)

        df = frame.copy()
        df["foo"] = "bar"
        sorted_after = df.sort_index(level=1)
        tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"],
                                                               axis=1))

        dft = frame.T
        sorted_before = dft.sort_index(level=1, axis=1)
        dft["foo", "three"] = "bar"

        sorted_after = dft.sort_index(level=1, axis=1)
        tm.assert_frame_equal(
            sorted_before.drop([("foo", "three")], axis=1),
            sorted_after.drop([("foo", "three")], axis=1),
        )

    def test_sort_index_preserve_levels(self,
                                        multiindex_dataframe_random_data):
        frame = multiindex_dataframe_random_data

        result = frame.sort_index()
        assert result.index.names == frame.index.names

    @pytest.mark.parametrize(
        "gen,extra",
        [
            ([1.0, 3.0, 2.0, 5.0], 4.0),
            ([1, 3, 2, 5], 4),
            (
                [
                    Timestamp("20130101"),
                    Timestamp("20130103"),
                    Timestamp("20130102"),
                    Timestamp("20130105"),
                ],
                Timestamp("20130104"),
            ),
            (["1one", "3one", "2one", "5one"], "4one"),
        ],
    )
    def test_sort_index_multilevel_repr_8017(self, gen, extra):

        np.random.seed(0)
        data = np.random.randn(3, 4)

        columns = MultiIndex.from_tuples([("red", i) for i in gen])
        df = DataFrame(data, index=list("def"), columns=columns)
        df2 = pd.concat(
            [
                df,
                DataFrame(
                    "world",
                    index=list("def"),
                    columns=MultiIndex.from_tuples([("red", extra)]),
                ),
            ],
            axis=1,
        )

        # check that the repr is good
        # make sure that we have a correct sparsified repr
        # e.g. only 1 header of 'red'
        assert str(df2).splitlines()[0].split() == ["red"]

        # GH 8017
        # sorting fails after columns added

        # construct single-dtype then sort
        result = df.copy().sort_index(axis=1)
        expected = df.iloc[:, [0, 2, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df2.sort_index(axis=1)
        expected = df2.iloc[:, [0, 2, 1, 4, 3]]
        tm.assert_frame_equal(result, expected)

        # setitem then sort
        result = df.copy()
        result[("red", extra)] = "world"

        result = result.sort_index(axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "categories",
        [
            pytest.param(["a", "b", "c"], id="str"),
            pytest.param(
                [pd.Interval(0, 1),
                 pd.Interval(1, 2),
                 pd.Interval(2, 3)],
                id="pd.Interval",
            ),
        ],
    )
    def test_sort_index_with_categories(self, categories):
        # GH#23452
        df = DataFrame(
            {"foo": range(len(categories))},
            index=CategoricalIndex(data=categories,
                                   categories=categories,
                                   ordered=True),
        )
        df.index = df.index.reorder_categories(df.index.categories[::-1])
        result = df.sort_index()
        expected = DataFrame(
            {"foo": reversed(range(len(categories)))},
            index=CategoricalIndex(data=categories[::-1],
                                   categories=categories[::-1],
                                   ordered=True),
        )
        tm.assert_frame_equal(result, expected)
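
A standalone sketch (illustrative values) of reorder_categories changing what sort_index produces:

import pandas as pd

idx = pd.CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"], ordered=True)
df = pd.DataFrame({"foo": [0, 1, 2]}, index=idx)
df.index = df.index.reorder_categories(["c", "b", "a"])
print(df.sort_index())  # rows now come out as c, b, a
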
Example #41
0
    def test_to_html_index(self):
        index = ['foo', 'bar', 'baz']
        df = DataFrame(
            {
                'A': [1, 2, 3],
                'B': [1.2, 3.4, 5.6],
                'C': ['one', 'two', np.nan]
            },
            columns=['A', 'B', 'C'],
            index=index)
        expected_with_index = ('<table border="1" class="dataframe">\n'
                               '  <thead>\n'
                               '    <tr style="text-align: right;">\n'
                               '      <th></th>\n'
                               '      <th>A</th>\n'
                               '      <th>B</th>\n'
                               '      <th>C</th>\n'
                               '    </tr>\n'
                               '  </thead>\n'
                               '  <tbody>\n'
                               '    <tr>\n'
                               '      <th>foo</th>\n'
                               '      <td>1</td>\n'
                               '      <td>1.2</td>\n'
                               '      <td>one</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>bar</th>\n'
                               '      <td>2</td>\n'
                               '      <td>3.4</td>\n'
                               '      <td>two</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>baz</th>\n'
                               '      <td>3</td>\n'
                               '      <td>5.6</td>\n'
                               '      <td>NaN</td>\n'
                               '    </tr>\n'
                               '  </tbody>\n'
                               '</table>')
        assert df.to_html() == expected_with_index

        expected_without_index = ('<table border="1" class="dataframe">\n'
                                  '  <thead>\n'
                                  '    <tr style="text-align: right;">\n'
                                  '      <th>A</th>\n'
                                  '      <th>B</th>\n'
                                  '      <th>C</th>\n'
                                  '    </tr>\n'
                                  '  </thead>\n'
                                  '  <tbody>\n'
                                  '    <tr>\n'
                                  '      <td>1</td>\n'
                                  '      <td>1.2</td>\n'
                                  '      <td>one</td>\n'
                                  '    </tr>\n'
                                  '    <tr>\n'
                                  '      <td>2</td>\n'
                                  '      <td>3.4</td>\n'
                                  '      <td>two</td>\n'
                                  '    </tr>\n'
                                  '    <tr>\n'
                                  '      <td>3</td>\n'
                                  '      <td>5.6</td>\n'
                                  '      <td>NaN</td>\n'
                                  '    </tr>\n'
                                  '  </tbody>\n'
                                  '</table>')
        result = df.to_html(index=False)
        for i in index:
            assert i not in result
        assert result == expected_without_index
        df.index = Index(['foo', 'bar', 'baz'], name='idx')
        expected_with_index = ('<table border="1" class="dataframe">\n'
                               '  <thead>\n'
                               '    <tr style="text-align: right;">\n'
                               '      <th></th>\n'
                               '      <th>A</th>\n'
                               '      <th>B</th>\n'
                               '      <th>C</th>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>idx</th>\n'
                               '      <th></th>\n'
                               '      <th></th>\n'
                               '      <th></th>\n'
                               '    </tr>\n'
                               '  </thead>\n'
                               '  <tbody>\n'
                               '    <tr>\n'
                               '      <th>foo</th>\n'
                               '      <td>1</td>\n'
                               '      <td>1.2</td>\n'
                               '      <td>one</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>bar</th>\n'
                               '      <td>2</td>\n'
                               '      <td>3.4</td>\n'
                               '      <td>two</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>baz</th>\n'
                               '      <td>3</td>\n'
                               '      <td>5.6</td>\n'
                               '      <td>NaN</td>\n'
                               '    </tr>\n'
                               '  </tbody>\n'
                               '</table>')
        assert df.to_html() == expected_with_index
        assert df.to_html(index=False) == expected_without_index

        tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')]
        df.index = MultiIndex.from_tuples(tuples)

        expected_with_index = ('<table border="1" class="dataframe">\n'
                               '  <thead>\n'
                               '    <tr style="text-align: right;">\n'
                               '      <th></th>\n'
                               '      <th></th>\n'
                               '      <th>A</th>\n'
                               '      <th>B</th>\n'
                               '      <th>C</th>\n'
                               '    </tr>\n'
                               '  </thead>\n'
                               '  <tbody>\n'
                               '    <tr>\n'
                               '      <th rowspan="2" valign="top">foo</th>\n'
                               '      <th>car</th>\n'
                               '      <td>1</td>\n'
                               '      <td>1.2</td>\n'
                               '      <td>one</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>bike</th>\n'
                               '      <td>2</td>\n'
                               '      <td>3.4</td>\n'
                               '      <td>two</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>bar</th>\n'
                               '      <th>car</th>\n'
                               '      <td>3</td>\n'
                               '      <td>5.6</td>\n'
                               '      <td>NaN</td>\n'
                               '    </tr>\n'
                               '  </tbody>\n'
                               '</table>')
        assert df.to_html() == expected_with_index

        result = df.to_html(index=False)
        for i in ['foo', 'bar', 'car', 'bike']:
            assert i not in result
        # must be the same result as normal index
        assert result == expected_without_index

        df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2'])
        expected_with_index = ('<table border="1" class="dataframe">\n'
                               '  <thead>\n'
                               '    <tr style="text-align: right;">\n'
                               '      <th></th>\n'
                               '      <th></th>\n'
                               '      <th>A</th>\n'
                               '      <th>B</th>\n'
                               '      <th>C</th>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>idx1</th>\n'
                               '      <th>idx2</th>\n'
                               '      <th></th>\n'
                               '      <th></th>\n'
                               '      <th></th>\n'
                               '    </tr>\n'
                               '  </thead>\n'
                               '  <tbody>\n'
                               '    <tr>\n'
                               '      <th rowspan="2" valign="top">foo</th>\n'
                               '      <th>car</th>\n'
                               '      <td>1</td>\n'
                               '      <td>1.2</td>\n'
                               '      <td>one</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>bike</th>\n'
                               '      <td>2</td>\n'
                               '      <td>3.4</td>\n'
                               '      <td>two</td>\n'
                               '    </tr>\n'
                               '    <tr>\n'
                               '      <th>bar</th>\n'
                               '      <th>car</th>\n'
                               '      <td>3</td>\n'
                               '      <td>5.6</td>\n'
                               '      <td>NaN</td>\n'
                               '    </tr>\n'
                               '  </tbody>\n'
                               '</table>')
        assert df.to_html() == expected_with_index
        assert df.to_html(index=False) == expected_without_index
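
A minimal usage sketch (toy frame) of suppressing the index in HTML output:

import pandas as pd

df = pd.DataFrame({"A": [1, 2]}, index=["foo", "bar"])
html = df.to_html(index=False)
assert "foo" not in html and "bar" not in html
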
Example #42
0
    def test_to_html_multiindex(self):
        columns = MultiIndex.from_tuples(list(
            zip(np.arange(2).repeat(2), np.mod(lrange(4), 2))),
                                         names=['CL0', 'CL1'])
        df = DataFrame([list('abcd'), list('efgh')], columns=columns)
        result = df.to_html(justify='left')
        expected = ('<table border="1" class="dataframe">\n'
                    '  <thead>\n'
                    '    <tr>\n'
                    '      <th>CL0</th>\n'
                    '      <th colspan="2" halign="left">0</th>\n'
                    '      <th colspan="2" halign="left">1</th>\n'
                    '    </tr>\n'
                    '    <tr>\n'
                    '      <th>CL1</th>\n'
                    '      <th>0</th>\n'
                    '      <th>1</th>\n'
                    '      <th>0</th>\n'
                    '      <th>1</th>\n'
                    '    </tr>\n'
                    '  </thead>\n'
                    '  <tbody>\n'
                    '    <tr>\n'
                    '      <th>0</th>\n'
                    '      <td>a</td>\n'
                    '      <td>b</td>\n'
                    '      <td>c</td>\n'
                    '      <td>d</td>\n'
                    '    </tr>\n'
                    '    <tr>\n'
                    '      <th>1</th>\n'
                    '      <td>e</td>\n'
                    '      <td>f</td>\n'
                    '      <td>g</td>\n'
                    '      <td>h</td>\n'
                    '    </tr>\n'
                    '  </tbody>\n'
                    '</table>')

        self.assertEqual(result, expected)

        columns = MultiIndex.from_tuples(
            list(zip(range(4), np.mod(lrange(4), 2))))
        df = DataFrame([list('abcd'), list('efgh')], columns=columns)

        result = df.to_html(justify='right')
        expected = ('<table border="1" class="dataframe">\n'
                    '  <thead>\n'
                    '    <tr>\n'
                    '      <th></th>\n'
                    '      <th>0</th>\n'
                    '      <th>1</th>\n'
                    '      <th>2</th>\n'
                    '      <th>3</th>\n'
                    '    </tr>\n'
                    '    <tr>\n'
                    '      <th></th>\n'
                    '      <th>0</th>\n'
                    '      <th>1</th>\n'
                    '      <th>0</th>\n'
                    '      <th>1</th>\n'
                    '    </tr>\n'
                    '  </thead>\n'
                    '  <tbody>\n'
                    '    <tr>\n'
                    '      <th>0</th>\n'
                    '      <td>a</td>\n'
                    '      <td>b</td>\n'
                    '      <td>c</td>\n'
                    '      <td>d</td>\n'
                    '    </tr>\n'
                    '    <tr>\n'
                    '      <th>1</th>\n'
                    '      <td>e</td>\n'
                    '      <td>f</td>\n'
                    '      <td>g</td>\n'
                    '      <td>h</td>\n'
                    '    </tr>\n'
                    '  </tbody>\n'
                    '</table>')

        self.assertEqual(result, expected)
    def test_rename_multiindex(self):

        tuples_index = [("foo1", "bar1"), ("foo2", "bar2")]
        tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")]
        index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"])
        columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])
        df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)

        #
        # without specifying level -> across all levels

        renamed = df.rename(
            index={"foo1": "foo3", "bar2": "bar3"},
            columns={"fizz1": "fizz3", "buzz2": "buzz3"},
        )
        new_index = MultiIndex.from_tuples(
            [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"]
        )
        new_columns = MultiIndex.from_tuples(
            [("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"]
        )
        tm.assert_index_equal(renamed.index, new_index)
        tm.assert_index_equal(renamed.columns, new_columns)
        assert renamed.index.names == df.index.names
        assert renamed.columns.names == df.columns.names

        #
        # with specifying a level (GH13766)

        # dict
        new_columns = MultiIndex.from_tuples(
            [("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"]
        )
        renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
        tm.assert_index_equal(renamed.columns, new_columns)

        new_columns = MultiIndex.from_tuples(
            [("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"]
        )
        renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
        tm.assert_index_equal(renamed.columns, new_columns)

        # function
        func = str.upper
        new_columns = MultiIndex.from_tuples(
            [("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"]
        )
        renamed = df.rename(columns=func, level=0)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns=func, level="fizz")
        tm.assert_index_equal(renamed.columns, new_columns)

        new_columns = MultiIndex.from_tuples(
            [("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"]
        )
        renamed = df.rename(columns=func, level=1)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns=func, level="buzz")
        tm.assert_index_equal(renamed.columns, new_columns)

        # index
        new_index = MultiIndex.from_tuples(
            [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"]
        )
        renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
        tm.assert_index_equal(renamed.index, new_index)
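
A compact sketch (illustrative labels) of level-restricted renaming on MultiIndex columns:

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [("fizz1", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"])
df = pd.DataFrame([[0, 0]], columns=cols)
# only level 0 is renamed; the "buzz2" mapping is ignored at this level
print(df.rename(columns={"fizz1": "X", "buzz2": "Y"}, level=0).columns.tolist())
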
Example #44
0
    def test_per_axis_per_level_getitem(self):

        # GH6134
        # example test case
        ix = MultiIndex.from_product(
            [_mklbl("A", 5),
             _mklbl("B", 7),
             _mklbl("C", 4),
             _mklbl("D", 2)])
        df = DataFrame(np.arange(len(ix.to_numpy())), index=ix)

        result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
        expected = df.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in df.index.values if (
            a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")]]
        tm.assert_frame_equal(result, expected)

        expected = df.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in df.index.values
                           if (a == "A1" or a == "A2" or a == "A3") and (
                               c == "C1" or c == "C2" or c == "C3")]]
        result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :]
        tm.assert_frame_equal(result, expected)

        # test multi-index slicing with per axis and per index controls
        index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3),
                                        ("B", 1)],
                                       names=["one", "two"])
        columns = MultiIndex.from_tuples(
            [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
            names=["lvl0", "lvl1"],
        )

        df = DataFrame(np.arange(16, dtype="int64").reshape(4, 4),
                       index=index,
                       columns=columns)
        df = df.sort_index(axis=0).sort_index(axis=1)

        # identity
        result = df.loc[(slice(None), slice(None)), :]
        tm.assert_frame_equal(result, df)
        result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
        tm.assert_frame_equal(result, df)
        result = df.loc[:, (slice(None), slice(None))]
        tm.assert_frame_equal(result, df)

        # index
        result = df.loc[(slice(None), [1]), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), 1), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        # columns
        result = df.loc[:, (slice(None), ["foo"])]
        expected = df.iloc[:, [1, 3]]
        tm.assert_frame_equal(result, expected)

        # both
        result = df.loc[(slice(None), 1), (slice(None), ["foo"])]
        expected = df.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.loc["A", "a"]
        expected = DataFrame(
            {
                "bar": [1, 5, 9],
                "foo": [0, 4, 8]
            },
            index=Index([1, 2, 3], name="two"),
            columns=Index(["bar", "foo"], name="lvl1"),
        )
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), [1, 2]), :]
        expected = df.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # multi-level series
        s = Series(np.arange(len(ix.to_numpy())), index=ix)
        result = s.loc["A1":"A3", :, ["C1", "C3"]]
        expected = s.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in s.index.values if (
            a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")]]
        tm.assert_series_equal(result, expected)

        # boolean indexers
        result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
        expected = df.iloc[[2, 3]]
        tm.assert_frame_equal(result, expected)

        msg = ("cannot index with a boolean indexer "
               "that is not the same length as the index")
        with pytest.raises(ValueError, match=msg):
            df.loc[(slice(None), np.array([True, False])), :]

        with pytest.raises(KeyError, match=r"\[1\] not in index"):
            # slice(None) is on the index, [1] is on the columns, but 1 is
            #  not in the columns, so we raise
            #  This used to treat [1] as positional GH#16396
            df.loc[slice(None), [1]]

        result = df.loc[(slice(None), [1]), :]
        expected = df.iloc[[0, 3]]
        tm.assert_frame_equal(result, expected)

        # not lexsorted
        assert df.index._lexsort_depth == 2
        df = df.sort_index(level=1, axis=0)
        assert df.index._lexsort_depth == 0

        msg = ("MultiIndex slicing requires the index to be "
               r"lexsorted: slicing on levels \[1\], lexsort depth 0")
        with pytest.raises(UnsortedIndexError, match=msg):
            df.loc[(slice(None), slice("bar")), :]

        # GH 16734: not sorted, but no real slicing
        result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :]
        tm.assert_frame_equal(result, df.iloc[[1, 3], :])
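
As an aside, pd.IndexSlice is an equivalent, more readable spelling of the slice(None) tuples used above; a small sketch with illustrative data:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["A", "B"], [1, 2]], names=["one", "two"])
df = pd.DataFrame(np.arange(8).reshape(4, 2), index=idx, columns=["x", "y"])
slicer = pd.IndexSlice
print(df.loc[slicer[:, 1], :])  # same rows as df.loc[(slice(None), 1), :]
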
Example #45
0
def test_where_array_like(klass):
    i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
    cond = [False, True]
    msg = r"\.where is not supported for MultiIndex operations"
    with pytest.raises(NotImplementedError, match=msg):
        i.where(klass(cond))
Example #46
0
def test_frame_select_complex2(setup_path):

    with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths:

        pp, hh = paths

        # use non-trivial selection criteria
        parms = DataFrame({"A": [1, 1, 2, 2, 3]})
        parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])

        selection = read_hdf(pp, "df", where="A=[2,3]")
        hist = DataFrame(
            np.random.randn(25, 1),
            columns=["data"],
            index=MultiIndex.from_tuples([(i, j) for i in range(5)
                                          for j in range(5)],
                                         names=["l1", "l2"]),
        )

        hist.to_hdf(hh, "df", mode="w", format="table")

        expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")

        # scope with list like
        l = selection.index.tolist()  # noqa
        store = HDFStore(hh)
        result = store.select("df", where="l1=l")
        tm.assert_frame_equal(result, expected)
        store.close()

        result = read_hdf(hh, "df", where="l1=l")
        tm.assert_frame_equal(result, expected)

        # index
        index = selection.index  # noqa
        result = read_hdf(hh, "df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        # scope with index
        store = HDFStore(hh)

        result = store.select("df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        store.close()
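
A condensed sketch of the variable-scoping feature exercised above (assuming PyTables is installed and "store.h5" is a writable path):

import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2, 2, 3]})
df.to_hdf("store.h5", "df", mode="w", format="table", data_columns=["A"])
keep = [2, 3]  # local variables can be referenced by name inside where=
print(pd.read_hdf("store.h5", "df", where="A=keep"))
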
Example #47
0
    def test_construction_list_tuples_nan(self, na_value, vtype):
        # GH#18505 : valid tuples containing NaN
        values = [(1, "two"), (3.0, na_value)]
        result = Index(vtype(values))
        expected = MultiIndex.from_tuples(values)
        tm.assert_index_equal(result, expected)
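
A one-liner sketch of the behavior under test: a list of tuples handed to Index is promoted to a MultiIndex, NaN included:

import numpy as np
import pandas as pd

idx = pd.Index([(1, "two"), (3.0, np.nan)])
print(type(idx))  # <class 'pandas.core.indexes.multi.MultiIndex'>
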
Example #48
0
def decode(obj):
    """
    Decoder for deserializing numpy data types.
    """

    typ = obj.get("typ")
    if typ is None:
        return obj
    elif typ == "timestamp":
        freq = obj["freq"] if "freq" in obj else obj["offset"]
        return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
    elif typ == "nat":
        return NaT
    elif typ == "period":
        return Period(ordinal=obj["ordinal"], freq=obj["freq"])
    elif typ == "index":
        dtype = dtype_for(obj["dtype"])
        data = unconvert(obj["data"], dtype, obj.get("compress"))
        return Index(data, dtype=dtype, name=obj["name"])
    elif typ == "range_index":
        return RangeIndex(obj["start"],
                          obj["stop"],
                          obj["step"],
                          name=obj["name"])
    elif typ == "multi_index":
        dtype = dtype_for(obj["dtype"])
        data = unconvert(obj["data"], dtype, obj.get("compress"))
        data = [tuple(x) for x in data]
        return MultiIndex.from_tuples(data, names=obj["names"])
    elif typ == "period_index":
        data = unconvert(obj["data"], np.int64, obj.get("compress"))
        d = dict(name=obj["name"], freq=obj["freq"])
        freq = d.pop("freq", None)
        return PeriodIndex(PeriodArray(data, freq), **d)

    elif typ == "datetime_index":
        data = unconvert(obj["data"], np.int64, obj.get("compress"))
        d = dict(name=obj["name"], freq=obj["freq"])
        result = DatetimeIndex(data, **d)
        tz = obj["tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result

    elif typ in ("interval_index", "interval_array"):
        return globals()[obj["klass"]].from_arrays(obj["left"],
                                                   obj["right"],
                                                   obj["closed"],
                                                   name=obj["name"])
    elif typ == "category":
        from_codes = globals()[obj["klass"]].from_codes
        return from_codes(codes=obj["codes"],
                          categories=obj["categories"],
                          ordered=obj["ordered"])

    elif typ == "interval":
        return Interval(obj["left"], obj["right"], obj["closed"])
    elif typ == "series":
        dtype = dtype_for(obj["dtype"])
        index = obj["index"]
        data = unconvert(obj["data"], dtype, obj["compress"])
        return Series(data, index=index, dtype=dtype, name=obj["name"])

    elif typ == "block_manager":
        axes = obj["axes"]

        def create_block(b):
            values = _safe_reshape(
                unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]),
                b["shape"])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if "locs" in b:
                placement = b["locs"]
            else:
                placement = axes[0].get_indexer(b["items"])

            if is_datetime64tz_dtype(b["dtype"]):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == "M8[ns]", values.dtype
                values = DatetimeArray(values, dtype=b["dtype"])

            return make_block(
                values=values,
                klass=getattr(internals, b["klass"]),
                placement=placement,
                dtype=b["dtype"],
            )

        blocks = [create_block(b) for b in obj["blocks"]]
        return globals()[obj["klass"]](BlockManager(blocks, axes))
    elif typ == "datetime":
        return parse(obj["data"])
    elif typ == "datetime64":
        return np.datetime64(parse(obj["data"]))
    elif typ == "date":
        return parse(obj["data"]).date()
    elif typ == "timedelta":
        return timedelta(*obj["data"])
    elif typ == "timedelta64":
        return np.timedelta64(int(obj["data"]))
    elif typ == "block_index":
        return globals()[obj["klass"]](obj["length"], obj["blocs"],
                                       obj["blengths"])
    elif typ == "int_index":
        return globals()[obj["klass"]](obj["length"], obj["indices"])
    elif typ == "ndarray":
        return unconvert(obj["data"], np.typeDict[obj["dtype"]],
                         obj.get("compress")).reshape(obj["shape"])
    elif typ == "np_scalar":
        if obj.get("sub_typ") == "np_complex":
            return c2f(obj["real"], obj["imag"], obj["dtype"])
        else:
            dtype = dtype_for(obj["dtype"])
            try:
                return dtype(obj["data"])
            except (ValueError, TypeError):
                return dtype.type(obj["data"])
    elif typ == "np_complex":
        return complex(obj["real"] + "+" + obj["imag"] + "j")
    elif isinstance(obj, (dict, list, set)):
        return obj
    else:
        return obj
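
A minimal usage sketch for the dispatcher above (assuming the same module imports, e.g. NaT and timedelta, are in scope):

print(decode({"typ": "nat"}))                        # NaT
print(decode({"typ": "timedelta", "data": (1, 0)}))  # datetime.timedelta(days=1)
print(decode({"other": 1}))                          # unknown shape passes through unchanged
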
Example #49
0
    def test_rename(self):
        mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}

        renamed = self.frame.rename(columns=mapping)
        renamed2 = self.frame.rename(columns=str.lower)

        assert_frame_equal(renamed, renamed2)
        assert_frame_equal(renamed2.rename(columns=str.upper),
                           self.frame,
                           check_names=False)

        # index
        data = {'A': {'foo': 0, 'bar': 1}}

        # gets sorted alphabetically
        df = DataFrame(data)
        renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'})
        self.assert_numpy_array_equal(renamed.index, ['foo', 'bar'])

        renamed = df.rename(index=str.upper)
        self.assert_numpy_array_equal(renamed.index, ['BAR', 'FOO'])

        # have to pass something
        self.assertRaises(TypeError, self.frame.rename)

        # partial columns
        renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'})
        self.assert_numpy_array_equal(renamed.columns,
                                      ['A', 'B', 'foo', 'bar'])

        # other axis
        renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
        self.assert_numpy_array_equal(renamed.index, ['A', 'B', 'foo', 'bar'])

        # index with name
        index = Index(['foo', 'bar'], name='name')
        renamer = DataFrame(data, index=index)
        renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
        self.assert_numpy_array_equal(renamed.index, ['bar', 'foo'])
        self.assertEqual(renamed.index.name, renamer.index.name)

        # MultiIndex
        tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')]
        tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')]
        index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar'])
        columns = MultiIndex.from_tuples(tuples_columns,
                                         names=['fizz', 'buzz'])
        renamer = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)
        renamed = renamer.rename(index={
            'foo1': 'foo3',
            'bar2': 'bar3'
        },
                                 columns={
                                     'fizz1': 'fizz3',
                                     'buzz2': 'buzz3'
                                 })
        new_index = MultiIndex.from_tuples([('foo3', 'bar1'),
                                            ('foo2', 'bar3')])
        new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'),
                                              ('fizz2', 'buzz3')])
        self.assert_numpy_array_equal(renamed.index, new_index)
        self.assert_numpy_array_equal(renamed.columns, new_columns)
        self.assertEqual(renamed.index.names, renamer.index.names)
        self.assertEqual(renamed.columns.names, renamer.columns.names)
Example #50
0
def test_difference(idx, sort):

    first = idx
    result = first.difference(idx[-3:], sort=sort)
    vals = idx[:-3].values

    if sort is None:
        vals = sorted(vals)

    expected = MultiIndex.from_tuples(vals, sortorder=0, names=idx.names)

    assert isinstance(result, MultiIndex)
    assert result.equals(expected)
    assert result.names == idx.names
    tm.assert_index_equal(result, expected)

    # empty difference: reflexive
    result = idx.difference(idx, sort=sort)
    expected = idx[:0]
    assert result.equals(expected)
    assert result.names == idx.names

    # empty difference: superset
    result = idx[-3:].difference(idx, sort=sort)
    expected = idx[:0]
    assert result.equals(expected)
    assert result.names == idx.names

    # empty difference: degenerate
    result = idx[:0].difference(idx, sort=sort)
    expected = idx[:0]
    assert result.equals(expected)
    assert result.names == idx.names

    # names not the same
    chunklet = idx[-3:]
    chunklet.names = ["foo", "baz"]
    result = first.difference(chunklet, sort=sort)
    assert result.names == (None, None)

    # empty, but non-equal
    result = idx.difference(idx.sortlevel(1)[0], sort=sort)
    assert len(result) == 0

    # no exception when called with a non-MultiIndex (array of tuples)
    result = first.difference(first.values, sort=sort)
    assert result.equals(first[:0])

    # name from empty array
    result = first.difference([], sort=sort)
    assert first.equals(result)
    assert first.names == result.names

    # name from non-empty array
    result = first.difference([("foo", "one")], sort=sort)
    expected = pd.MultiIndex.from_tuples(
        [("bar", "one"), ("baz", "two"), ("foo", "two"), ("qux", "one"), ("qux", "two")]
    )
    expected.names = first.names
    assert first.names == result.names

    msg = "other must be a MultiIndex or a list of tuples"
    with pytest.raises(TypeError, match=msg):
        first.difference([1, 2, 3, 4, 5], sort=sort)
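
A small standalone sketch (illustrative tuples) of MultiIndex.difference preserving names:

import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)], names=["x", "y"])
print(mi.difference(mi[-1:]))  # drops ("b", 1); names "x"/"y" are kept
print(mi.difference(mi))       # empty MultiIndex, names still kept
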
    def test_column_dups_operations(self):
        def check(result, expected=None):
            if expected is not None:
                tm.assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = list(range(2))
        df = DataFrame(arr, columns=["A", "A"])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range("20130101", periods=4, freq="Q-NOV")
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
        )
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        df["string"] = "bah"
        expected = DataFrame(
            [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
            columns=["foo", "bar", "foo", "hello", "string"],
        )
        check(df, expected)
        with pytest.raises(ValueError, match="Length of value"):
            df.insert(0, "AnotherColumn", range(len(df.index) - 1))

        # insert same dtype
        df["foo2"] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # set (non-dup)
        df["foo2"] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)
        df["foo2"] = 3

        # delete (non dup)
        del df["bar"]
        expected = DataFrame(
            [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
            columns=["foo", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df["hello"]
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # consolidate
        df = df._consolidate()
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # insert
        df.insert(2, "new_col", 5.0)
        expected = DataFrame(
            [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
            columns=["foo", "foo", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # insert a dup
        with pytest.raises(ValueError, match="cannot insert"):
            df.insert(2, "new_col", 4.0)

        df.insert(2, "new_col", 4.0, allow_duplicates=True)
        expected = DataFrame(
            [
                [1, 1, 4.0, 5.0, "bah", 3],
                [1, 2, 4.0, 5.0, "bah", 3],
                [2, 3, 4.0, 5.0, "bah", 3],
            ],
            columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # delete (dup)
        del df["foo"]
        expected = DataFrame(
            [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
            columns=["new_col", "new_col", "string", "foo2"],
        )
        tm.assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame(
            [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        check(df)

        df["foo2"] = 7.0
        expected = DataFrame(
            [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        result = df["foo"]
        expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
        check(result, expected)

        # multiple replacements
        df["foo"] = "string"
        expected = DataFrame(
            [
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
            ],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        del df["foo"]
        expected = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
        )
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        assert (result == expected).all().all()

        # rename, GH 4403
        df4 = DataFrame(
            {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
            index=MultiIndex.from_tuples(
                [(600809, 20130331)], names=["STK_ID", "RPT_Date"]
            ),
        )

        df5 = DataFrame(
            {
                "RPT_Date": [20120930, 20121231, 20130331],
                "STK_ID": [600809] * 3,
                "STK_Name": ["饡驦", "饡驦", "饡驦"],
                "TClose": [38.05, 41.66, 30.01],
            },
            index=MultiIndex.from_tuples(
                [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
                names=["STK_ID", "RPT_Date"],
            ),
        )

        k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
        result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
        str(result)
        result.dtypes

        expected = DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
            columns=[
                "RT",
                "TClose",
                "TExg",
                "RPT_Date",
                "STK_ID",
                "STK_Name",
                "QT_Close",
            ],
        ).set_index(["STK_ID", "RPT_Date"], drop=False)
        tm.assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar"])
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar", "foo"])

        # drop
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        result = df.drop(["a"], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=["bar"])
        check(result, expected)
        result = df.drop("a", axis=1)
        check(result, expected)

        # describe
        df = DataFrame(
            [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
            columns=["bar", "a", "a"],
            dtype="float64",
        )
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(
            np.random.randn(5, 3),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "A"],
        )
        for index in [df.index, pd.Index(list("edcba"))]:
            this_df = df.copy()
            expected_ser = Series(index.values, index=this_df.index)
            expected_df = DataFrame(
                {"A": expected_ser, "B": this_df["B"], "A": expected_ser},
                columns=["A", "B", "A"],
            )
            this_df["A"] = index
            check(this_df, expected_df)

        # operations
        for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ["A", "A"]
            df.columns = ["A", "A"]
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
        expected = DataFrame(1.0, index=range(5), columns=["that", "that"])

        df["that"] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
        expected = DataFrame(1, index=range(5), columns=["that", "that"])

        df["that"] = 1
        check(df, expected)
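
A brief sketch (toy frame) of the duplicate-column behaviors exercised above:

import pandas as pd

df = pd.DataFrame([[1, 1.0], [2, 2.0]], columns=["foo", "foo"])
print(df["foo"])  # selecting a duplicated label returns a DataFrame, not a Series
df.insert(1, "foo", 9, allow_duplicates=True)
print(df.columns.tolist())  # ['foo', 'foo', 'foo']
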
    def test_column_dups_operations(self):

        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with pytest.raises(ValueError, match='Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df._consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        with pytest.raises(ValueError, match='cannot insert'):
            df.insert(2, 'new_col', 4.)

        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                             columns=['bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        assert (result == expected).all().all()

        # rename, GH 4403
        df4 = DataFrame(
            {'RT': [0.0454],
             'TClose': [22.02],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_ID': [600809] * 3,
                         'STK_Name': ['饡驦', '饡驦', '饡驦'],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                '饡驦', 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=['bar'])
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame({'A': expected_ser,
                                     'B': this_df['B'],
                                     'A': expected_ser},
                                    columns=['A', 'B', 'A'])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
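
The assertions above lean on two behaviours of duplicate column labels that are easy to miss when reading the test; a minimal sketch, assuming a reasonably recent pandas (the frame and labels below are illustrative, not taken from the test):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['a', 'a'])

# inserting a label that already exists raises unless allow_duplicates=True
df.insert(0, 'a', 9, allow_duplicates=True)

# reindexing over a duplicated axis is invalid and raises ValueError
try:
    df.reindex(columns=['a'])
except ValueError as err:
    print(err)  # e.g. "cannot reindex from a duplicate axis" (wording varies by version)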
Example #53
0
class TestDataFrameConvertTo(TestData):
    def test_to_dict_timestamp(self):

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(
            orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"),
                             expected_series)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="series"),
                             expected_series_mixed)

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="split"),
                             expected_split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        # convert_datetime64 defaults to None
        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

        # check for FutureWarning if convert_datetime64=False is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index.values[0]
            result = df.to_records(convert_datetime64=False)["index"][0]
            assert expected == result

        # check for FutureWarning if convert_datetime64=True is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index[0]
            result = df.to_records(convert_datetime64=True)["index"][0]
            assert expected == result

    def test_to_records_with_multindex(self):
        # GH3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):

        # GH8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32,
                    "C": "<U2"
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={
                    0: "int16",
                    "not-there": "float32"
                }),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.dtype("int8"),
                    "B": np.dtype("float32")
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                },
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dtype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": 5
                }),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={
                        "A": "int32",
                        "B": CategoricalDtype(["a", "b"])
                    },
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": "foo"
                }),
                (TypeError, 'data type "foo" not understood'),
            ),
        ],
    )
    @pytest.mark.skipif(not is_platform_little_endian(),
                        reason="expected values assume little-endian")
    def test_to_records_dtype(self, kwargs, expected):
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "int32",
                         1: "int8"
                     }),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={
                    0: "<U1",
                    2: "float32"
                },
                     index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")],
                                                   names=list("ab")),
                    index=MultiIndex.from_tuples([("d", -4), ("d", -5),
                                                  ("f", -6)],
                                                 names=list("cd")),
                ),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "<U2",
                         1: "int8"
                     }),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    @pytest.mark.skipif(not is_platform_little_endian(),
                        reason="expected values assume little-endian")
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    @pytest.mark.skipif(not is_platform_little_endian(),
                        reason="expected values assume little-endian")
    def test_to_records_dict_like(self):
        # see gh-18146
        class DictLike:
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{
                "A": np.int8,
                "B": np.float32
            }),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        test_data = {
            "A": {
                "1": 1,
                "2": 2
            },
            "B": {
                "1": "1",
                "2": "2",
                "3": "3"
            }
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [
            {
                "A": 1.0,
                "B": "1"
            },
            {
                "A": 2.0,
                "B": "2"
            },
            {
                "A": np.nan,
                "B": "3"
            },
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        # GH16122
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split",
             lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        # 14216, 23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_frame_to_dict_tz(self):
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc), ),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc), ),
        ]
        df = DataFrame(list(data), columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {
                "d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)
            },
            {
                "d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)
            },
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {
                        "int_col": 1,
                        "float_col": 1.0
                    },
                    1: {
                        "int_col": 2,
                        "float_col": 2.0
                    },
                    2: {
                        "int_col": 3,
                        "float_col": 3.0
                    },
                },
            ),
            (
                OrderedDict,
                OrderedDict([
                    (0, {
                        "int_col": 1,
                        "float_col": 1.0
                    }),
                    (1, {
                        "int_col": 2,
                        "float_col": 2.0
                    }),
                    (2, {
                        "int_col": 3,
                        "float_col": 3.0
                    }),
                ]),
            ),
            (
                defaultdict(list),
                defaultdict(
                    list,
                    {
                        0: {
                            "int_col": 1,
                            "float_col": 1.0
                        },
                        1: {
                            "int_col": 2,
                            "float_col": 2.0
                        },
                        2: {
                            "int_col": 3,
                            "float_col": 3.0
                        },
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        cols = ["int_col", "float_col"]
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {"A_{:d}".format(i): i for i in range(256)}
        assert result == expected
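
For orientation, the to_dict orients exercised by this class produce dictionaries of the following shapes; a small sketch on a toy frame (not taken from the test file):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [0.1, 0.2]})

df.to_dict(orient="records")  # [{'a': 1, 'b': 0.1}, {'a': 2, 'b': 0.2}]
df.to_dict(orient="split")    # {'index': [0, 1], 'columns': ['a', 'b'], 'data': [[1, 0.1], [2, 0.2]]}
df.to_dict(orient="index")    # {0: {'a': 1, 'b': 0.1}, 1: {'a': 2, 'b': 0.2}}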
Example #54
0
 def test_rename_mi(self):
     df = DataFrame([11, 21, 31],
                    index=MultiIndex.from_tuples([
                        ("A", x) for x in ["a", "B", "c"]
                    ]))
     df.rename(str.lower)
Example #55
0
def makeCustomIndex(nentries,
                    nlevels,
                    prefix="#",
                    names=False,
                    ndupe_l=None,
                    idx_type=None):
    """
    Create an index/multiindex with given dimensions, levels, names, etc.

    nentries - number of entries in index
    nlevels - number of levels (> 1 produces multiindex)
    prefix - a string prefix for labels
    names - (Optional), bool or list of strings. If True, will use default
       names; if False, will use no names; if a list is given, the name of
       each level in the index will be taken from the list.
    ndupe_l - (Optional), list of ints, the number of rows for which the
       label will be repeated at the corresponding level. You can specify
       just the first few levels; the rest will use the default ndupe_l of 1.
       len(ndupe_l) <= nlevels.
    idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
       If idx_type is not None, `nlevels` must be 1.
       "i"/"f" creates an integer/float index,
       "s"/"u" creates a string/unicode index,
       "dt" creates a datetime index,
       "td" creates a timedelta index.

       If unspecified, string labels will be generated.
    """
    if ndupe_l is None:
        ndupe_l = [1] * nlevels
    assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels
    assert (names is None or names is False or names is True
            or len(names) == nlevels)
    assert idx_type is None or (idx_type in ("i", "f", "s", "u", "dt", "p",
                                             "td") and nlevels == 1)

    if names is True:
        # build default names
        names = [prefix + str(i) for i in range(nlevels)]
    if names is False:
        # pass None to index constructor for no name
        names = None

    # make singleton case uniform
    if isinstance(names, str) and nlevels == 1:
        names = [names]

    # specific 1D index type requested?
    idx_func_dict: dict[str, Callable[..., Index]] = {
        "i": makeIntIndex,
        "f": makeFloatIndex,
        "s": makeStringIndex,
        "u": makeUnicodeIndex,
        "dt": makeDateIndex,
        "td": makeTimedeltaIndex,
        "p": makePeriodIndex,
    }
    idx_func = idx_func_dict.get(idx_type)
    if idx_func:
        idx = idx_func(nentries)
        # but we need to fill in the name
        if names:
            idx.name = names[0]
        return idx
    elif idx_type is not None:
        raise ValueError(
            f"{repr(idx_type)} is not a legal value for `idx_type`, "
            "use  'i'/'f'/'s'/'u'/'dt'/'p'/'td'.")

    if len(ndupe_l) < nlevels:
        ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
    assert len(ndupe_l) == nlevels

    assert all(x > 0 for x in ndupe_l)

    list_of_lists = []
    for i in range(nlevels):

        def keyfunc(x):
            import re

            numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
            return [int(num) for num in numeric_tuple]

        # build a list of lists to create the index from
        div_factor = nentries // ndupe_l[i] + 1

        # Since Python 3.9 collections.Counter supports subscription in
        # annotations (Counter[str]) directly, per PEP 585 generic alias types.
        cnt: Counter[str] = collections.Counter()
        for j in range(div_factor):
            label = f"{prefix}_l{i}_g{j}"
            cnt[label] = ndupe_l[i]
        # cute Counter trick
        result = sorted(cnt.elements(), key=keyfunc)[:nentries]
        list_of_lists.append(result)

    tuples = list(zip(*list_of_lists))

    # convert tuples to index
    if nentries == 1:
        # we have a single level of tuples, i.e. a regular Index
        index = Index(tuples[0], name=names[0])
    elif nlevels == 1:
        name = None if names is None else names[0]
        index = Index((x[0] for x in tuples), name=name)
    else:
        index = MultiIndex.from_tuples(tuples, names=names)
    return index
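
A brief usage sketch for makeCustomIndex, assuming the index makers it dispatches to (makeIntIndex, makeDateIndex, and so on) are importable from the same testing module:

# two-level MultiIndex with 4 entries; level-0 labels repeat twice each (ndupe_l=[2])
mi = makeCustomIndex(nentries=4, nlevels=2, names=True, ndupe_l=[2])

# a length-3 DatetimeIndex; idx_type requires nlevels == 1
dti = makeCustomIndex(nentries=3, nlevels=1, idx_type="dt")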
Example #56
0
 def test_constructor_dict_of_tuples(self):
     data = {(1, 2): 3, (None, 5): 6}
     result = Series(data).sort_values()
     expected = Series([3, 6],
                       index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
     tm.assert_series_equal(result, expected)
Example #57
0
    def test_per_axis_per_level_setitem(self):

        # test index maker
        idx = pd.IndexSlice

        # test multi-index slicing with per axis and per index controls
        index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3),
                                        ("B", 1)],
                                       names=["one", "two"])
        columns = MultiIndex.from_tuples(
            [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
            names=["lvl0", "lvl1"],
        )

        df_orig = DataFrame(np.arange(16, dtype="int64").reshape(4, 4),
                            index=index,
                            columns=columns)
        df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

        # identity
        df = df_orig.copy()
        df.loc[(slice(None), slice(None)), :] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc(axis=0)[:, :] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[:, (slice(None), slice(None))] = 100
        expected = df_orig.copy()
        expected.iloc[:, :] = 100
        tm.assert_frame_equal(df, expected)

        # index
        df = df_orig.copy()
        df.loc[(slice(None), [1]), :] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), 1), :] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc(axis=0)[:, 1] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # columns
        df = df_orig.copy()
        df.loc[:, (slice(None), ["foo"])] = 100
        expected = df_orig.copy()
        expected.iloc[:, [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # both
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[idx[:, 1], idx[:, ["foo"]]] = 100
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc["A", "a"] = 100
        expected = df_orig.copy()
        expected.iloc[0:3, 0:2] = 100
        tm.assert_frame_equal(df, expected)

        # setting with a list-like
        df = df_orig.copy()
        df.loc[(slice(None), 1),
               (slice(None), ["foo"])] = np.array([[100, 100], [100, 100]],
                                                  dtype="int64")
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = 100
        tm.assert_frame_equal(df, expected)

        # not enough values
        df = df_orig.copy()

        msg = "setting an array element with a sequence."
        with pytest.raises(ValueError, match=msg):
            df.loc[(slice(None), 1),
                   (slice(None), ["foo"])] = np.array([[100], [100, 100]],
                                                      dtype="int64")

        msg = "Must have equal len keys and value when setting with an iterable"
        with pytest.raises(ValueError, match=msg):
            df.loc[(slice(None), 1),
                   (slice(None), ["foo"])] = np.array([100, 100, 100, 100],
                                                      dtype="int64")

        # with an alignable rhs
        df = df_orig.copy()
        df.loc[(slice(None), 1),
               (slice(None), ["foo"])] = (df.loc[(slice(None), 1),
                                                 (slice(None), ["foo"])] * 5)
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[(slice(None), 1),
               (slice(None), ["foo"])] *= df.loc[(slice(None), 1),
                                                 (slice(None), ["foo"])]
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(df, expected)

        rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy()
        rhs.loc[:, ("c", "bah")] = 10
        df = df_orig.copy()
        df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs
        expected = df_orig.copy()
        expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]]
        tm.assert_frame_equal(df, expected)
Example #58
0
    def test_per_axis_per_level_doc_examples(self):

        # test index maker
        idx = pd.IndexSlice

        # from indexing.rst / advanced
        index = MultiIndex.from_product(
            [_mklbl("A", 4),
             _mklbl("B", 2),
             _mklbl("C", 4),
             _mklbl("D", 2)])
        columns = MultiIndex.from_tuples(
            [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
            names=["lvl0", "lvl1"],
        )
        df = DataFrame(
            np.arange(len(index) * len(columns), dtype="int64").reshape(
                (len(index), len(columns))),
            index=index,
            columns=columns,
        )
        result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
        expected = df.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in df.index.values if (
            a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3")]]
        tm.assert_frame_equal(result, expected)
        result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :]
        tm.assert_frame_equal(result, expected)

        result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :]
        expected = df.loc[[(
            a,
            b,
            c,
            d,
        ) for a, b, c, d in df.index.values if (c == "C1" or c == "C3")]]
        tm.assert_frame_equal(result, expected)
        result = df.loc[idx[:, :, ["C1", "C3"]], :]
        tm.assert_frame_equal(result, expected)

        # not sorted
        msg = ("MultiIndex slicing requires the index to be lexsorted: "
               r"slicing on levels \[1\], lexsort depth 1")
        with pytest.raises(UnsortedIndexError, match=msg):
            df.loc["A1", ("a", slice("foo"))]

        # GH 16734: not sorted, but no real slicing
        tm.assert_frame_equal(df.loc["A1", (slice(None), "foo")],
                              df.loc["A1"].iloc[:, [0, 2]])

        df = df.sort_index(axis=1)

        # slicing
        df.loc["A1", (slice(None), "foo")]
        df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")]

        # setitem
        df.loc(axis=0)[:, :, ["C1", "C3"]] = -10
Example #59
0
def test_multiindex_unique():
    mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
    assert mi.is_unique is True

    result = hash_pandas_object(mi)
    assert result.is_unique is True
Example #60
0
    def test_equals(self):
        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
        s2 = s1.copy()
        assert s1.equals(s2)

        s1[1] = 99
        assert not s1.equals(s2)

        # NaNs compare as equal
        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
        s2 = s1.copy()
        assert s1.equals(s2)

        s2[0] = 9.9
        assert not s1.equals(s2)

        idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")])
        s1 = Series([1, 2, np.nan], index=idx)
        s2 = s1.copy()
        assert s1.equals(s2)

        # Add object dtype column with nans
        index = np.random.random(10)
        df1 = DataFrame(np.random.random(10), index=index, columns=["floats"])
        df1["text"] = "the sky is so blue. we could use more chocolate.".split(
        )
        df1["start"] = date_range("2000-1-1", periods=10, freq="T")
        df1["end"] = date_range("2000-1-1", periods=10, freq="D")
        df1["diff"] = df1["end"] - df1["start"]
        df1["bool"] = np.arange(10) % 3 == 0
        df1.loc[::2] = np.nan
        df2 = df1.copy()
        assert df1["text"].equals(df2["text"])
        assert df1["start"].equals(df2["start"])
        assert df1["end"].equals(df2["end"])
        assert df1["diff"].equals(df2["diff"])
        assert df1["bool"].equals(df2["bool"])
        assert df1.equals(df2)
        assert not df1.equals(object)

        # different dtype
        different = df1.copy()
        different["floats"] = different["floats"].astype("float32")
        assert not df1.equals(different)

        # different index
        different_index = -index
        different = df2.set_index(different_index)
        assert not df1.equals(different)

        # different columns
        different = df2.copy()
        different.columns = df2.columns[::-1]
        assert not df1.equals(different)

        # DatetimeIndex
        index = pd.date_range("2000-1-1", periods=10, freq="T")
        df1 = df1.set_index(index)
        df2 = df1.copy()
        assert df1.equals(df2)

        # MultiIndex
        df3 = df1.set_index(["text"], append=True)
        df2 = df1.set_index(["text"], append=True)
        assert df3.equals(df2)

        df2 = df1.set_index(["floats"], append=True)
        assert not df3.equals(df2)

        # NaN in index
        df3 = df1.set_index(["floats"], append=True)
        df2 = df1.set_index(["floats"], append=True)
        assert df3.equals(df2)

        # GH 8437
        a = pd.Series([False, np.nan])
        b = pd.Series([False, np.nan])
        c = pd.Series(index=range(2), dtype=object)
        d = c.copy()
        e = c.copy()
        f = c.copy()
        c[:-1] = d[:-1] = e[0] = f[0] = False
        assert a.equals(a)
        assert a.equals(b)
        assert a.equals(c)
        assert a.equals(d)
        assert a.equals(e)
        assert e.equals(f)
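
The "NaNs compare as equal" comment above is the key difference between .equals and elementwise ==; a tiny sketch on toy data (not from the test):

import numpy as np
import pandas as pd

s1 = pd.Series([1.0, np.nan])
s2 = pd.Series([1.0, np.nan])

(s1 == s2).all()  # False: NaN != NaN elementwise
s1.equals(s2)     # True: equals treats NaNs in matching positions as equal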