Beispiel #1
0
    def test_set_index_names(self):
        df = pd.util.testing.makeDataFrame()
        df.index.name = 'name'

        assert df.set_index(df.index).index.names == ['name']

        mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B'])
        mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values,
                                     names=['A', 'B', 'C', 'D'])

        df = df.set_index(['A', 'B'])

        assert df.set_index(df.index).index.names == ['A', 'B']

        # Check that set_index isn't converting a MultiIndex into an Index
        assert isinstance(df.set_index(df.index).index, MultiIndex)

        # Check actual equality
        tm.assert_index_equal(df.set_index(df.index).index, mi)

        idx2 = df.index.rename(['C', 'D'])

        # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
        # than a pair of tuples
        assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)

        # Check equality
        tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
    def test_set_index_names(self):
        df = pd.util.testing.makeDataFrame()
        df.index.name = 'name'

        self.assertEqual(df.set_index(df.index).index.names, ['name'])

        mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B'])
        mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values,
                                     names=['A', 'B', 'A', 'B'])

        df = df.set_index(['A', 'B'])

        self.assertEqual(df.set_index(df.index).index.names, ['A', 'B'])

        # Check that set_index isn't converting a MultiIndex into an Index
        self.assertTrue(isinstance(df.set_index(df.index).index, MultiIndex))

        # Check actual equality
        tm.assert_index_equal(df.set_index(df.index).index, mi)

        # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
        # than a pair of tuples
        self.assertTrue(isinstance(df.set_index(
            [df.index, df.index]).index, MultiIndex))

        # Check equality
        tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2)
Beispiel #3
0
def test_append_mixed_dtypes():
    # GH 13660
    dti = date_range('2011-01-01', freq='M', periods=3, )
    dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern')
    pi = period_range('2011-01', freq='M', periods=3)

    mi = MultiIndex.from_arrays([[1, 2, 3],
                                 [1.1, np.nan, 3.3],
                                 ['a', 'b', 'c'],
                                 dti, dti_tz, pi])
    assert mi.nlevels == 6

    res = mi.append(mi)
    exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3],
                                  [1.1, np.nan, 3.3, 1.1, np.nan, 3.3],
                                  ['a', 'b', 'c', 'a', 'b', 'c'],
                                  dti.append(dti),
                                  dti_tz.append(dti_tz),
                                  pi.append(pi)])
    tm.assert_index_equal(res, exp)

    other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'],
                                    ['x', 'y', 'z'], ['x', 'y', 'z'],
                                    ['x', 'y', 'z'], ['x', 'y', 'z']])

    res = mi.append(other)
    exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'],
                                  [1.1, np.nan, 3.3, 'x', 'y', 'z'],
                                  ['a', 'b', 'c', 'x', 'y', 'z'],
                                  dti.append(pd.Index(['x', 'y', 'z'])),
                                  dti_tz.append(pd.Index(['x', 'y', 'z'])),
                                  pi.append(pd.Index(['x', 'y', 'z']))])
    tm.assert_index_equal(res, exp)
Beispiel #4
0
def test_dropna():
    # GH 6194
    idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5],
                                     [1, 2, np.nan, np.nan, 5],
                                     ['a', 'b', 'c', np.nan, 'e']])

    exp = pd.MultiIndex.from_arrays([[1, 5],
                                     [1, 5],
                                     ['a', 'e']])
    tm.assert_index_equal(idx.dropna(), exp)
    tm.assert_index_equal(idx.dropna(how='any'), exp)

    exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5],
                                     [1, 2, np.nan, 5],
                                     ['a', 'b', 'c', 'e']])
    tm.assert_index_equal(idx.dropna(how='all'), exp)

    msg = "invalid how option: xxx"
    with pytest.raises(ValueError, match=msg):
        idx.dropna(how='xxx')

    # GH26408
    # test if missing values are dropped for mutiindex constructed
    # from codes and values
    idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2],
                             [np.nan, None, pd.NaT, "128", 2]],
                     codes=[[0, -1, 1, 2, 3, 4],
                            [0, -1, 3, 3, 3, 4]])
    expected = MultiIndex.from_arrays([["128", 2], ["128", 2]])
    tm.assert_index_equal(idx.dropna(), expected)
    tm.assert_index_equal(idx.dropna(how='any'), expected)

    expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2],
                                       ["128", "128", "128", 2]])
    tm.assert_index_equal(idx.dropna(how='all'), expected)
Beispiel #5
0
def test_get_duplicates():
    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        with tm.assert_produces_warning(FutureWarning):
            # Deprecated - see GH20239
            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(),
                                    np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            codes = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
                            codes=np.random.permutation(list(codes)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            with tm.assert_produces_warning(FutureWarning):
                # Deprecated - see GH20239
                assert mi.get_duplicates().equals(MultiIndex.from_arrays(
                    [[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))
Beispiel #6
0
    def test_pivot_datetime_tz(self):
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': range(6), 'value2': [1, 2] * 3})
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1')
        exp_col1 = Index(['value1', 'value1'])
        exp_col2 = Index(['a', 'b'], name='label')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                             index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1'])
        tm.assert_frame_equal(result, expected)


        exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean'])
        exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
        exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4,
                                    tz='Asia/Tokyo', name='dt2')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1],
                              [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col)

        result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'],
                             aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
Beispiel #7
0
    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names
Beispiel #8
0
    def test_concat_with_group_keys(self):
        df = DataFrame(np.random.randn(4, 3))
        df2 = DataFrame(np.random.randn(4, 4))

        # axis=0
        df = DataFrame(np.random.randn(3, 4))
        df2 = DataFrame(np.random.randn(4, 4))

        result = concat([df, df2], keys=[0, 1])
        exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1],
                                            [0, 1, 2, 0, 1, 2, 3]])
        expected = DataFrame(np.r_[df.values, df2.values],
                             index=exp_index)
        tm.assert_frame_equal(result, expected)

        result = concat([df, df], keys=[0, 1])
        exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
                                             [0, 1, 2, 0, 1, 2]])
        expected = DataFrame(np.r_[df.values, df.values],
                             index=exp_index2)
        tm.assert_frame_equal(result, expected)

        # axis=1
        df = DataFrame(np.random.randn(4, 3))
        df2 = DataFrame(np.random.randn(4, 4))

        result = concat([df, df2], keys=[0, 1], axis=1)
        expected = DataFrame(np.c_[df.values, df2.values],
                             columns=exp_index)
        tm.assert_frame_equal(result, expected)

        result = concat([df, df], keys=[0, 1], axis=1)
        expected = DataFrame(np.c_[df.values, df.values],
                             columns=exp_index2)
        tm.assert_frame_equal(result, expected)
 def test_unstack_bool(self):
     df = DataFrame([False, False], index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), columns=["col"])
     rs = df.unstack()
     xp = DataFrame(
         np.array([[False, np.nan], [np.nan, False]], dtype=object),
         index=["a", "b"],
         columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
     )
     assert_frame_equal(rs, xp)
Beispiel #10
0
 def test_unstack_bool(self):
     df = DataFrame([False, False],
                    index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
                    columns=['col'])
     rs = df.unstack()
     xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
                             dtype=object),
                    index=['a', 'b'],
                    columns=MultiIndex.from_arrays([['col', 'col'],
                                                    ['c', 'l']]))
     assert_frame_equal(rs, xp)
    def setup(self):
        n = 1182720
        low, high = -4096, 4096
        arrs = [np.repeat(np.random.randint(low, high, (n // k)), k)
                for k in [11, 7, 5, 3, 1]]
        self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)]

        a = np.repeat(np.arange(100), 1000)
        b = np.tile(np.arange(1000), 100)
        self.mi = MultiIndex.from_arrays([a, b])
        self.mi = self.mi.take(np.random.permutation(np.arange(100000)))
Beispiel #12
0
    def _to_narrow(self, data, mask, dates, assets):
        """
        Convert raw computed pipeline results into a DataFrame for public APIs.

        Parameters
        ----------
        data : dict[str -> ndarray[ndim=2]]
            Dict mapping column names to computed results.
        mask : ndarray[bool, ndim=2]
            Mask array of values to keep.
        dates : ndarray[datetime64, ndim=1]
            Row index for arrays `data` and `mask`
        assets : ndarray[int64, ndim=2]
            Column index for arrays `data` and `mask`

        Returns
        -------
        results : pd.DataFrame
            The indices of `results` are as follows:

            index : two-tiered MultiIndex of (date, asset).
                Contains an entry for each (date, asset) pair corresponding to
                a `True` value in `mask`.
            columns : Index of str
                One column per entry in `data`.

        If mask[date, asset] is True, then result.loc[(date, asset), colname]
        will contain the value of data[colname][date, asset].
        """
        if not mask.any():
            # Manually handle the empty DataFrame case. This is a workaround
            # to pandas failing to tz_localize an empty dataframe with a
            # MultiIndex. It also saves us the work of applying a known-empty
            # mask to each array.
            #
            # Slicing `dates` here to preserve pandas metadata.
            empty_dates = dates[:0]
            empty_assets = array([], dtype=object)
            return DataFrame(
                data={
                    name: array([], dtype=arr.dtype)
                    for name, arr in iteritems(data)
                },
                index=MultiIndex.from_arrays([empty_dates, empty_assets]),
            )

        resolved_assets = array(self._finder.retrieve_all(assets))
        dates_kept = repeat_last_axis(dates.values, len(assets))[mask]
        assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask]
        return DataFrame(
            data={name: arr[mask] for name, arr in iteritems(data)},
            index=MultiIndex.from_arrays([dates_kept, assets_kept]),
        ).tz_localize('UTC', level=0)
Beispiel #13
0
def test_from_arrays(idx):
    arrays = [np.asarray(lev).take(level_codes)
              for lev, level_codes in zip(idx.levels, idx.codes)]

    # list of arrays as input
    result = MultiIndex.from_arrays(arrays, names=idx.names)
    tm.assert_index_equal(result, idx)

    # infer correctly
    result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')],
                                     ['a', 'b']])
    assert result.levels[0].equals(Index([Timestamp('20130101')]))
    assert result.levels[1].equals(Index(['a', 'b']))
Beispiel #14
0
def test_from_arrays_iterator(idx):
    # GH 18434
    arrays = [np.asarray(lev).take(level_codes)
              for lev, level_codes in zip(idx.levels, idx.codes)]

    # iterator as input
    result = MultiIndex.from_arrays(iter(arrays), names=idx.names)
    tm.assert_index_equal(result, idx)

    # invalid iterator input
    msg = "Input must be a list / sequence of array-likes."
    with pytest.raises(TypeError, match=msg):
        MultiIndex.from_arrays(0)
Beispiel #15
0
    def test_pivot_datetime_tz(self):
        dates1 = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        dates2 = [
            "2013-01-01 15:00:00",
            "2013-01-01 15:00:00",
            "2013-01-01 15:00:00",
            "2013-02-01 15:00:00",
            "2013-02-01 15:00:00",
            "2013-02-01 15:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "dt1": dates1,
                "dt2": dates2,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific"))
        df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo"))

        exp_idx = pd.DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], tz="US/Pacific", name="dt1"
        )
        exp_col1 = Index(["value1", "value1"])
        exp_col2 = Index(["a", "b"], name="label")
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"])
        tm.assert_frame_equal(result, expected)

        exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"])
        exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2)
        exp_col3 = pd.DatetimeIndex(["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, tz="Asia/Tokyo", name="dt2")
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(
            np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2]], dtype="int64"),
            index=exp_idx,
            columns=exp_col,
        )

        result = pivot_table(df, index=["dt1"], columns=["dt2"], values=["value1", "value2"], aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
Beispiel #16
0
def test_unique_datetimelike():
    idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
                          '2015-01-01', 'NaT', 'NaT'])
    idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
                          '2015-01-02', 'NaT', '2015-01-01'],
                         tz='Asia/Tokyo')
    result = MultiIndex.from_arrays([idx1, idx2]).unique()

    eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
    eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02',
                           'NaT', '2015-01-01'],
                          tz='Asia/Tokyo')
    exp = MultiIndex.from_arrays([eidx1, eidx2])
    tm.assert_index_equal(result, exp)
Beispiel #17
0
def test_from_arrays_iterator(idx):
    # GH 18434
    arrays = []
    for lev, lab in zip(idx.levels, idx.labels):
        arrays.append(np.asarray(lev).take(lab))

    # iterator as input
    result = MultiIndex.from_arrays(iter(arrays), names=idx.names)
    tm.assert_index_equal(result, idx)

    # invalid iterator input
    with tm.assert_raises_regex(
            TypeError, "Input must be a list / sequence of array-likes."):
        MultiIndex.from_arrays(0)
Beispiel #18
0
def test_isin():
    values = [('foo', 2), ('bar', 3), ('quux', 4)]

    idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
        4)])
    result = idx.isin(values)
    expected = np.array([False, False, True, True])
    tm.assert_numpy_array_equal(result, expected)

    # empty, return dtype bool
    idx = MultiIndex.from_arrays([[], []])
    result = idx.isin(values)
    assert len(result) == 0
    assert result.dtype == np.bool_
Beispiel #19
0
    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed due to 'str' type in py2 when read back in py3 is 'bytes' which breaks the workflow
            # of people migrating to py3. # https://github.com/manahl/arctic/issues/598
            # This should not be used for a normal flow, and you should instead of writing unicode strings
            # if you want to work with str in py3.,

            for c in df.select_dtypes(object):
                # The conversion is not using astype similar to the index as pandas has a bug where it tries to convert
                # the data columns to a unicode string, and the object in this case would be bytes, eg. b'abc'
                # which is converted to u"b'abc'" i.e it includes the b character as well! This generally happens
                # when there is a str conversion without specifying the encoding. eg. str(b'abc') -> "b'abc'" and the
                # fix for this is to tell it the encoding to use: i.e str(b'abc', 'utf-8') -> "abc"
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            if type(df.columns[0]) == bytes:
                df.columns = df.index.astype('unicode')

        return df
Beispiel #20
0
    def setup(self, axis):
        self.df = DataFrame(np.random.randn(10000, 1000))
        self.df.ix[50:1000, 20:50] = np.nan
        self.df.ix[2000:3000] = np.nan
        self.df.ix[:, 60:70] = np.nan
        self.df_mixed = self.df.copy()
        self.df_mixed['foo'] = 'bar'

        self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index])
        self.df.columns = MultiIndex.from_arrays([self.df.columns,
                                                  self.df.columns])
        self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index,
                                                      self.df_mixed.index])
        self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns,
                                                        self.df_mixed.columns])
Beispiel #21
0
    def test_query_with_partially_named_multiindex(self, parser, engine):
        skip_if_no_pandas_parser(parser)
        a = np.random.choice(['red', 'green'], size=10)
        b = np.arange(10)
        index = MultiIndex.from_arrays([a, b])
        index.names = [None, 'rating']
        df = DataFrame(np.random.randn(10, 2), index=index)
        res = df.query('rating == 1', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values('rating').values, index=index,
                     name='rating')
        exp = df[ind == 1]
        assert_frame_equal(res, exp)

        res = df.query('rating != 1', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values('rating').values, index=index,
                     name='rating')
        exp = df[ind != 1]
        assert_frame_equal(res, exp)

        res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values(0).values, index=index)
        exp = df[ind == "red"]
        assert_frame_equal(res, exp)

        res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values(0).values, index=index)
        exp = df[ind != "red"]
        assert_frame_equal(res, exp)
Beispiel #22
0
    def test_to_excel_multiindex_dates(self):
        _skip_if_no_xlrd()
        ext = self.ext
        path = '__tmp_to_excel_multiindex_dates__' + ext + '__.' + ext

        # try multiindex with dates
        tsframe = self.tsframe
        old_index = tsframe.index
        new_index = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(new_index)

        with ensure_clean(path) as path:
            tsframe.to_excel(path, 'test1', index_label=['time', 'foo'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])

            tm.assert_frame_equal(tsframe, recons, check_names=False)
            self.assertEquals(recons.index.names, ('time', 'foo'))

            # infer index
            tsframe.to_excel(path, 'test1')
            reader = ExcelFile(path)
            recons = reader.parse('test1')
            tm.assert_frame_equal(tsframe, recons)

            self.tsframe.index = old_index  # needed if setUP becomes classmethod
Beispiel #23
0
    def test_constructor(self):
        assert self.ts.index.is_all_dates

        # Pass in Series
        derived = Series(self.ts)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, self.ts.index)
        # Ensure new index is not created
        assert id(self.ts.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not self.empty.index.is_all_dates
        assert not Series({}).index.is_all_dates
        pytest.raises(Exception, Series, np.random.randn(3, 3),
                      index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        pytest.raises(NotImplementedError, Series, m)
Beispiel #24
0
def test_nsmallest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list('a' * 5 + 'b' * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series([
        1, 2, 3, 0, 4, 6
    ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series([
        0, 1, 1, 0, 1, 2
    ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
    tm.assert_series_equal(gb.nsmallest(3, keep='last'), e)
Beispiel #25
0
def test_get_loc_missing_nan():
    # GH 8569
    idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
    assert isinstance(idx.get_loc(1), slice)
    pytest.raises(KeyError, idx.get_loc, 3)
    pytest.raises(KeyError, idx.get_loc, np.nan)
    pytest.raises(KeyError, idx.get_loc, [np.nan])
Beispiel #26
0
def test_isin_level_kwarg():
    idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
        4)])

    vals_0 = ['foo', 'bar', 'quux']
    vals_1 = [2, 3, 10]

    expected = np.array([False, False, True, True])
    tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=0))
    tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=-2))

    tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=1))
    tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=-1))

    msg = "Too many levels: Index has only 2 levels, not 6"
    with pytest.raises(IndexError, match=msg):
        idx.isin(vals_0, level=5)
    msg = ("Too many levels: Index has only 2 levels, -5 is not a valid level"
           " number")
    with pytest.raises(IndexError, match=msg):
        idx.isin(vals_0, level=-5)

    with pytest.raises(KeyError, match=r"'Level 1\.0 not found'"):
        idx.isin(vals_0, level=1.0)
    with pytest.raises(KeyError, match=r"'Level -1\.0 not found'"):
        idx.isin(vals_1, level=-1.0)
    with pytest.raises(KeyError, match="'Level A not found'"):
        idx.isin(vals_1, level='A')

    idx.names = ['A', 'B']
    tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level='A'))
    tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level='B'))

    with pytest.raises(KeyError, match="'Level C not found'"):
        idx.isin(vals_1, level='C')
Beispiel #27
0
    def test_constructor(self):
        self.assertTrue(self.ts.index.is_all_dates)

        # Pass in Series
        derived = Series(self.ts)
        self.assertTrue(derived.index.is_all_dates)

        self.assertTrue(tm.equalContents(derived.index, self.ts.index))
        # Ensure new index is not created
        self.assertEqual(id(self.ts.index), id(derived.index))

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        self.assertEqual(mixed.dtype, np.object_)
        self.assertIs(mixed[1], np.NaN)

        self.assertFalse(self.empty.index.is_all_dates)
        self.assertFalse(Series({}).index.is_all_dates)
        self.assertRaises(Exception, Series, np.random.randn(3, 3),
                          index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        self.assertEqual(rs, xp)

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        self.assertRaises(NotImplementedError, Series, m)
Beispiel #28
0
    def test_constructor(self, datetime_series, empty_series):
        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        assert id(datetime_series.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not empty_series.index.is_all_dates
        assert not Series({}).index.is_all_dates

        # exception raised is of type Exception
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(np.random.randn(3, 3), index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        msg = "initializing a Series from a MultiIndex is not supported"
        with pytest.raises(NotImplementedError, match=msg):
            Series(m)
 def setup(self):
     size = 65536
     arrays = [np.random.randint(0, 8192, size),
               np.random.randint(0, 1024, size)]
     mask = np.random.rand(size) < 0.1
     self.mi_unused_levels = MultiIndex.from_arrays(arrays)
     self.mi_unused_levels = self.mi_unused_levels[mask]
Beispiel #30
0
 def test_empty_with_reversed_multiindex(self):
     data = 'x,y,z'
     result = self.read_csv(StringIO(data), index_col=[1, 0])
     expected = DataFrame([], columns=['z'],
                          index=MultiIndex.from_arrays(
                              [[]] * 2, names=['y', 'x']))
     tm.assert_frame_equal(result, expected, check_index_type=False)
Beispiel #31
0
def test_from_arrays_different_lengths(idx1, idx2):
    # see gh-13599
    msg = "^all arrays must be same length$"
    with pytest.raises(ValueError, match=msg):
        MultiIndex.from_arrays([idx1, idx2])
     "columns": ["x", "z"],
     "index": Index([], name="y")
 }),
 ("x", {
     "columns": ["y", "z"],
     "index": Index([], name="x")
 }),
 ("y", {
     "columns": ["x", "z"],
     "index": Index([], name="y")
 }),
 (
     [0, 1],
     {
         "columns": ["z"],
         "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
     },
 ),
 (
     ["x", "y"],
     {
         "columns": ["z"],
         "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
     },
 ),
 (
     [1, 0],
     {
         "columns": ["z"],
         "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
     },
def test_is_monotonic_decreasing():
    i = MultiIndex.from_product(
        [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"]
    )
    assert i.is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True

    i = MultiIndex.from_product(
        [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"]
    )
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    i = MultiIndex.from_product(
        [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"]
    )
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]])
    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    # string ordering
    i = MultiIndex(
        levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    assert i.is_monotonic_decreasing is False
    assert Index(i.values).is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False
    assert Index(i.values)._is_strictly_monotonic_decreasing is False

    i = MultiIndex(
        levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    assert i.is_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values)._is_strictly_monotonic_decreasing is True

    # mixed levels, hits the TypeError
    i = MultiIndex(
        levels=[
            [4, 3, 2, 1],
            [
                "nl0000301109",
                "nl0000289965",
                "nl0000289783",
                "lu0197800237",
                "gb00b03mlx29",
            ],
        ],
        codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
        names=["household_id", "asset_id"],
    )

    assert i.is_monotonic_decreasing is False
    assert i._is_strictly_monotonic_decreasing is False

    # empty
    i = MultiIndex.from_arrays([[], []])
    assert i.is_monotonic_decreasing is True
    assert Index(i.values).is_monotonic_decreasing is True
    assert i._is_strictly_monotonic_decreasing is True
    assert Index(i.values)._is_strictly_monotonic_decreasing is True
Beispiel #34
0
def test_index_equal_empty_iterable():
    # #16844
    a = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"])
    b = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"])
    tm.assert_index_equal(a, b)
Beispiel #35
0
def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
    # issue 19132
    idx = MultiIndex.from_arrays(index_arr)
    with tm.assert_produces_warning(FutureWarning, match="'kind' argument"):
        result = idx.get_slice_bound(target, side=algo, kind="loc")
    assert result == expected
Beispiel #36
0
        def _do_test(df,
                     r_dtype=None,
                     c_dtype=None,
                     rnlvl=None,
                     cnlvl=None,
                     dupe_col=False):

            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)

                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)
            else:
                kwargs['header'] = 0

                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x

            if dupe_col:
                # read_Csv disambiguates the columns by
                # labeling them dupe.1,dupe.2, etc'. monkey patch columns
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                delta_lvl = [
                    recons.iloc[:, i].values for i in range(rnlvl - 1)
                ]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
            if r_dtype:
                if r_dtype == 'u':  # unicode
                    r_dtype = 'O'
                    recons.index = np.array(lmap(_to_uni, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
                elif r_dtype == 'dt':  # unicode
                    r_dtype = 'O'
                    recons.index = np.array(lmap(Timestamp, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(Timestamp, df.index),
                                        dtype=r_dtype)
                elif r_dtype == 'p':
                    r_dtype = 'O'
                    recons.index = np.array(list(
                        map(Timestamp, to_datetime(recons.index))),
                                            dtype=r_dtype)
                    df.index = np.array(list(
                        map(Timestamp, df.index.to_timestamp())),
                                        dtype=r_dtype)
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                if c_dtype == 'u':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(_to_uni, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(_to_uni, df.columns),
                                          dtype=c_dtype)
                elif c_dtype == 'dt':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(Timestamp, df.columns),
                                          dtype=c_dtype)
                elif c_dtype == 'p':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp,
                                                   to_datetime(
                                                       recons.columns)),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(Timestamp,
                                               df.columns.to_timestamp()),
                                          dtype=c_dtype)
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            assert_frame_equal(df,
                               recons,
                               check_names=False,
                               check_less_precise=True)
Beispiel #37
0
def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
    # issue 19132
    idx = MultiIndex.from_arrays(index_arr)
    result = idx.get_slice_bound(target, side=algo, kind="loc")
    assert result == expected
Beispiel #38
0
def test_get_indexer_with_missing_value(index_arr, labels, expected):
    # issue 19132
    idx = MultiIndex.from_arrays(index_arr)
    result = idx.get_indexer(labels)
    tm.assert_numpy_array_equal(result, expected)
Beispiel #39
0
def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays(
        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
    )
    expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
        )

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        "cat": Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
        ),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(
        list("ab"), name="cat", categories=list("abc"), ordered=True
    )
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
    if not observed:
        index = CategoricalIndex(
            list("abc"), name="cat", categories=list("abc"), ordered=True
        )
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame(
        {
            "val": [10, 30, 20, 40],
            "cat": Categorical(
                ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
            ),
            "ints": [1, 2, 1, 2],
        }
    ).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
        )

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)
Beispiel #40
0
    tm.assert_numpy_array_equal(end, np.array(expected_end), check_dtype=False)
    assert len(start) == len(end)


@pytest.mark.parametrize(
    "df, window_size, expected",
    [
        (
            DataFrame({
                "b": [0, 1, 2],
                "a": [1, 2, 2]
            }),
            2,
            Series(
                [0, 1.5, 2.0],
                index=MultiIndex.from_arrays([[1, 2, 2], range(3)],
                                             names=["a", None]),
                name="b",
                dtype=np.float64,
            ),
        ),
        (
            DataFrame({
                "b": [np.nan, 1, 2, np.nan] + list(range(4, 18)),
                "a": [1] * 7 + [2] * 11,
                "c": range(18),
            }),
            12,
            Series(
                [
                    3.6,
                    3.6,
Beispiel #41
0
    def test_set_index2(self):
        df = DataFrame({
            'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
            'B': ['one', 'two', 'three', 'one', 'two'],
            'C': ['a', 'b', 'c', 'd', 'e'],
            'D': np.random.randn(5),
            'E': np.random.randn(5)
        })

        # new object, single-column
        result = df.set_index('C')
        result_nodrop = df.set_index('C', drop=False)

        index = Index(df['C'], name='C')

        expected = df.loc[:, ['A', 'B', 'D', 'E']]
        expected.index = index

        expected_nodrop = df.copy()
        expected_nodrop.index = index

        assert_frame_equal(result, expected)
        assert_frame_equal(result_nodrop, expected_nodrop)
        self.assertEqual(result.index.name, index.name)

        # inplace, single
        df2 = df.copy()

        df2.set_index('C', inplace=True)

        assert_frame_equal(df2, expected)

        df3 = df.copy()
        df3.set_index('C', drop=False, inplace=True)

        assert_frame_equal(df3, expected_nodrop)

        # create new object, multi-column
        result = df.set_index(['A', 'B'])
        result_nodrop = df.set_index(['A', 'B'], drop=False)

        index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B'])

        expected = df.loc[:, ['C', 'D', 'E']]
        expected.index = index

        expected_nodrop = df.copy()
        expected_nodrop.index = index

        assert_frame_equal(result, expected)
        assert_frame_equal(result_nodrop, expected_nodrop)
        self.assertEqual(result.index.names, index.names)

        # inplace
        df2 = df.copy()
        df2.set_index(['A', 'B'], inplace=True)
        assert_frame_equal(df2, expected)

        df3 = df.copy()
        df3.set_index(['A', 'B'], drop=False, inplace=True)
        assert_frame_equal(df3, expected_nodrop)

        # corner case
        with assertRaisesRegexp(ValueError, 'Index has duplicate keys'):
            df.set_index('A', verify_integrity=True)

        # append
        result = df.set_index(['A', 'B'], append=True)
        xp = df.reset_index().set_index(['index', 'A', 'B'])
        xp.index.names = [None, 'A', 'B']
        assert_frame_equal(result, xp)

        # append to existing multiindex
        rdf = df.set_index(['A'], append=True)
        rdf = rdf.set_index(['B', 'C'], append=True)
        expected = df.set_index(['A', 'B', 'C'], append=True)
        assert_frame_equal(rdf, expected)

        # Series
        result = df.set_index(df.C)
        self.assertEqual(result.index.name, 'C')
Beispiel #42
0
    def test_to_csv_multiindex(self):

        frame = self.frame
        old_index = frame.index
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
        frame.index = new_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:

            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=['A', 'B'])

            # round trip
            frame.to_csv(path)

            df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            assert_frame_equal(frame, df, check_names=False)
            assert frame.index.names == df.index.names

            # needed if setUp becomes a class method
            self.frame.index = old_index

            # try multiindex with dates
            tsframe = self.tsframe
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=['time', 'foo'])
            recons = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = self.read_csv(path, index_col=None)
            assert len(recons.columns) == len(tsframe.columns) + 2

            # no index
            tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            assert_almost_equal(recons.values, self.tsframe.values)

            # needed if setUp becomes class method
            self.tsframe.index = old_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                if names is True:
                    names = ['first', 'second']
                return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                                 columns=MultiIndex.from_tuples(
                                     [('bah', 'foo'), ('bah', 'bar'),
                                      ('ban', 'baz')],
                                     names=names),
                                 dtype='int64')

            # column & index are multi-index
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
            assert_frame_equal(df, result)

            # column is mi
            df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
            assert_frame_equal(df, result)

            # dup column names?
            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
            assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert com._all_none(*result.columns.names)
            result.columns.names = df.columns.names
            assert_frame_equal(df, result)

            # tupleize_cols=True and index=False
            df = _make_frame(True)
            with tm.assert_produces_warning(FutureWarning):
                df.to_csv(path, tupleize_cols=True, index=False)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = read_csv(path,
                                  header=0,
                                  tupleize_cols=True,
                                  index_col=None)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            assert_frame_equal(df, result)

            # column & index are multi-index (compatibility)
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            with tm.assert_produces_warning(FutureWarning):
                df.to_csv(path, tupleize_cols=True)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = read_csv(path,
                                  header=0,
                                  index_col=[0, 1],
                                  tupleize_cols=True)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path)

            for i in [6, 7]:
                msg = 'len of {i}, but only 5 lines in file'.format(i=i)
                with pytest.raises(ParserError, match=msg):
                    read_csv(path, header=lrange(i), index_col=0)

            # write with cols
            msg = 'cannot specify cols with a MultiIndex'
            with pytest.raises(TypeError, match=msg):
                df.to_csv(path, columns=['foo', 'bar'])

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # empty
            tsframe[:0].to_csv(path)
            recons = self.read_csv(path)

            exp = tsframe[:0]
            exp.index = []

            tm.assert_index_equal(recons.columns, exp.columns)
            assert len(recons) == 0
Beispiel #43
0
def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx):
    # issue 19132
    idx = MultiIndex.from_arrays(index_arr)
    result = idx.slice_locs(start=start_idx, end=end_idx)
    assert result == expected
Beispiel #44
0
def test_multiindex_name_and_level_raising():
    # GH#20421
    mi = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["x", "y"])
    with pytest.raises(TypeError, match="Can not pass level for dictlike `names`."):
        mi.set_names(names={"x": "z"}, level={"x": "z"})
Beispiel #45
0
def sims_data(**kwargs):
    """
    Gets trace element data grouped by sample and
    minerals. The data is formatted as a nested ordered
    dictionary, for iterability. This data can be used
    as-is or averaged, as appropriate.

    kwargs:
      exclude_bad: Exclude bad items (True)
      whole_rock: Return recalculated whole-rock data (False)
      raw: Return raw data not normalized to CI chondrite (False)
      ree_only: Return only rare-earth element data (False)
    """

    exclude_bad = kwargs.pop('exclude_bad',True)
    whole_rock = kwargs.pop('whole_rock',False)
    raw = kwargs.pop('raw',False)

    if exclude_bad:
        bad_data = datum.bad.isnot(True)
    else:
        # A meaningless filter
        bad_data = True

    if raw:
        ppm, std = (datum.raw_ppm, datum.raw_std)
    else:
        ppm, std = (datum.norm_ppm, datum.norm_std)

    q = (db.session.query(datum)
        .filter(datum.element != 14)
        .filter(bad_data)
        .join(meas)
        .group_by(
            meas.sample_id,
            meas.mineral,
            datum.element)
        .with_entities(
            meas.sample_id,
            meas.mineral,
            datum.element,
            func.array_agg(ppm)
                .label('ppm'),
            func.array_agg(std)
                .label('std'),
            func.count(ppm)
                .label('n')))

    df = read_sql(q.statement,q.session.bind)

    # Apply uncertainty
    def uncertainty(row):
        vals = tuple(zip(row['ppm'],row['std']))
        return tuple(uval(n,s) for n,s in vals)

    df['norm'] = df.apply(uncertainty,axis=1)
    df = (df
        .drop('ppm', axis=1)
        .drop('std', axis=1))

    fn = lambda x: elements[x['element']].symbol
    df['symbol'] = df.apply(fn, axis=1)
    df['average'] = df['norm'].apply(N.mean)

    names = ('sample_id','mineral','element','symbol')
    df.index = MultiIndex.from_arrays(
        [df.pop(i) for i in names])
    df.sortlevel(inplace=True)

    if kwargs.pop('ree_only',True):
        df = ree_only(df)

    # Add recalculated whole-rock data if requested
    if not whole_rock:
        return df

    xenoliths = (db.session.query(Sample)
        .filter_by(xenolith=True))
    modes = {s.id:s.modes() for s in xenoliths.all()}
    modes = DataFrame.from_dict(modes, orient='index')
    modes = modes.stack()
    modes.name = 'mode'
    ix = ['sample_id','mineral']
    modes.index.names = ix

    d1 = df.reset_index().join(modes,on=ix)
    d1['average'] *= d1.pop('mode')
    d1 = d1.groupby(['sample_id','element','symbol'])
    d1 = d1.aggregate(dict(average=sum))
    d1['mineral'] = 'whole_rock'
    d1.set_index('mineral',append=True, inplace=True)
    d1 = d1.reorder_levels(df.index.names)
    return concat([df,d1])
Beispiel #46
0
 def test_large_mi_contains(self):
     # GH#10645
     result = MultiIndex.from_arrays([range(10**6), range(10**6)])
     assert not (10**6, 0) in result
def test_isin_multi_index_with_missing_value(labels, expected, level):
    # GH 19132
    midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]])
    tm.assert_numpy_array_equal(midx.isin(labels, level=level), expected)
Beispiel #48
0
def test_is_monotonic_increasing():
    i = MultiIndex.from_product([np.arange(10), np.arange(10)],
                                names=['one', 'two'])
    assert i.is_monotonic is True
    assert i._is_strictly_monotonic_increasing is True
    assert Index(i.values).is_monotonic is True
    assert i._is_strictly_monotonic_increasing is True

    i = MultiIndex.from_product(
        [np.arange(10, 0, -1), np.arange(10)], names=['one', 'two'])
    assert i.is_monotonic is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values).is_monotonic is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    i = MultiIndex.from_product(
        [np.arange(10), np.arange(10, 0, -1)], names=['one', 'two'])
    assert i.is_monotonic is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values).is_monotonic is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']])
    assert i.is_monotonic is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values).is_monotonic is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    # string ordering
    i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                           ['one', 'two', 'three']],
                   codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                          [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                   names=['first', 'second'])
    assert i.is_monotonic is False
    assert Index(i.values).is_monotonic is False
    assert i._is_strictly_monotonic_increasing is False
    assert Index(i.values)._is_strictly_monotonic_increasing is False

    i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'],
                           ['mom', 'next', 'zenith']],
                   codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                          [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                   names=['first', 'second'])
    assert i.is_monotonic is True
    assert Index(i.values).is_monotonic is True
    assert i._is_strictly_monotonic_increasing is True
    assert Index(i.values)._is_strictly_monotonic_increasing is True

    # mixed levels, hits the TypeError
    i = MultiIndex(levels=[[1, 2, 3, 4],
                           [
                               'gb00b03mlx29', 'lu0197800237', 'nl0000289783',
                               'nl0000289965', 'nl0000301109'
                           ]],
                   codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]],
                   names=['household_id', 'asset_id'])

    assert i.is_monotonic is False
    assert i._is_strictly_monotonic_increasing is False

    # empty
    i = MultiIndex.from_arrays([[], []])
    assert i.is_monotonic is True
    assert Index(i.values).is_monotonic is True
    assert i._is_strictly_monotonic_increasing is True
    assert Index(i.values)._is_strictly_monotonic_increasing is True
Beispiel #49
0
def test_name_mi_with_dict_like_duplicate_names(func, rename_dict, exp_names):
    # GH#20421
    mi = MultiIndex.from_arrays([[1, 2], [3, 4], [5, 6]], names=["x", "y", "x"])
    result = getattr(mi, func)(rename_dict)
    expected = MultiIndex.from_arrays([[1, 2], [3, 4], [5, 6]], names=exp_names)
    tm.assert_index_equal(result, expected)
Beispiel #50
0
def test_from_tuples_empty():
    # GH 16777
    result = MultiIndex.from_tuples([], names=["a", "b"])
    expected = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"])
    tm.assert_index_equal(result, expected)
Beispiel #51
0
    def _groupby_reduce(self, ops=None):
        columns = self._df._get_columns()
        dtypes = self._df._frame.dtypes

        valid_ops = {}
        valid_columns = []

        size_reduction = len(ops) == 1 and ops[0][0] == "size"
        # If the ops are given as a list, apply them across all the columns
        # with compatible data types
        if isinstance(ops, list):
            key_indices = set(self._keys)
            for col_idx, col in enumerate(columns):
                if col_idx in key_indices:
                    continue

                if reduction.incompatible_ops(ops, dtypes[col_idx]):
                    continue

                valid_ops[col_idx] = [desc[0] for desc in ops]
                valid_columns.append(col)
                # Special case with the size reduction, which produces a single
                # output regardless of the number of input columns
                if size_reduction:
                    break

        # If the ops are passed in a dictionary, it also specifies the input
        # columns on which the aggregation are performed
        else:
            assert is_dict_like(ops)
            for col, descs in ops.items():
                col_idx = columns.get_indexer_for([col])
                if len(col_idx) > 1:
                    raise KeyError(f"ambiguous column name {col}")
                if col_idx[0] == -1:
                    raise KeyError(col)

                if reduction.incompatible_ops(descs, dtypes[col_idx[0]]):
                    continue

                valid_ops[col_idx[0]] = [desc[0] for desc in descs]
                valid_columns.append(col)

        frame = self._df._frame.groupby_reduce(self._keys, valid_ops,
                                               self._method, self._sort)

        # If more than one aggregation is requested for a column,
        # the output column names should use MultiIndex
        multi_aggs = any(len(set(ops)) > 1 for ops in valid_ops.values())

        def _generate_columns(columns, all_ops):
            if multi_aggs:
                from pandas import MultiIndex

                pairs = []
                for idx, ops in all_ops.items():
                    pairs.extend([(columns[idx], op) for op in ops])
                index = MultiIndex.from_tuples(pairs)

                if self._is_series_groupby:
                    index = index.droplevel(0)

                return index

            else:
                from pandas import Index

                return Index([columns[idx] for idx in all_ops.keys()])

        from .dataframe import DataFrame

        if self._as_index:
            # Groupby keys are rearranged to come first in the frame,
            # no matter where they were in the input frame, so the
            # indexer should be picking the first N keys in the frame,
            # where N is the number of keys
            indexer = list(range(len(self._keys)))
            index_columns = frame.select_columns(indexer)

            # However, the index names are derived from the input
            # dataframe, which is not rearranged, so we use the original
            # indexer to select the names
            index_names = columns[self._keys]

            value_names = _generate_columns(columns, valid_ops)

            # Once we find the index columns, we drop them from the frame
            frame = frame.drop_columns(indexer)
            frame = frame.set_index(index_columns, index_names)

            if size_reduction or (self._is_series_groupby and not multi_aggs):
                # Size reduction always produces a series
                from .series import Series

                return Series(frame=frame, name=value_names[0])
            else:
                return DataFrame(frame=frame, columns=value_names)

        else:
            # Index levels don't survive in the output when as_index is False
            levels = set(self._levels)
            keys = [key for key in self._keys if key not in levels]

            key_names = columns[keys]
            value_names = _generate_columns(columns, valid_ops)

            # If the column names are stored in a MultiIndex,
            # we should extend the key names to match the shape
            if multi_aggs:
                from pandas import MultiIndex

                key_names = MultiIndex.from_arrays(
                    [key_names, [""] * len(key_names)])

            value_names = key_names.append(value_names)
            frame = frame.drop_columns(self._levels)
            return DataFrame(frame=frame, columns=value_names)
Beispiel #52
0
            Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)),
        ],
        names=["key1", "key2"],
    )
    expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "as_index, expected",
    [
        (
            True,
            Series(
                index=MultiIndex.from_arrays(
                    [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"]
                ),
                data=[1, 2, 3],
                name="x",
            ),
        ),
        (
            False,
            DataFrame(
                {
                    "a": Series([1, 1, 2], dtype="category"),
                    "b": [1, 2, 2],
                    "x": [1, 2, 3],
                }
            ),
        ),
Beispiel #53
0
print(data.unstack())  # unstack之后就是一个dataframe
print(data.unstack().stack())  # stack之后是一个series
print('DataFrame的层次索引')
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
print(frame)
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame)
print(frame.ix['a', 1])   # a表示key1,1表示key2,这条语句表示key1为a,key2为1的那一行
print(frame.ix['a', 2]['Colorado'])  # 第一个[]表示行索引index,第二个[]表示列索引column
print(frame.ix['a', 2]['Ohio']['Red'])
print(frame.ix['a',2]['Ohio','Red'])
print('直接用MultiIndex创建层次索引结构')
print(MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Gree', 'Red', 'Green']],
                             names = ['state', 'color']))   # 这里索引的先后顺序会按照字母大小来排列

# reordering_and_sorting_levels 重新分级顺序
print('索引层级交换')
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame_swapped = frame.swaplevel('key1', 'key2')
print(frame_swapped)
print(frame_swapped.swaplevel(0, 1))
print('根据索引排序')
# print(frame.sortlevel('key2'))
# print(frame.swaplevel(0, 1).sortlevel(0))
print(frame.sort_index(level= 'key2'))   # 根据key2排序
print(frame.swaplevel(0, 1).sort_index(level = 0))  # 交换index 0和1,再对index为0的进行排序
Beispiel #54
0
    def _to_narrow(self, terms, data, mask, dates, assets):
        """
        Convert raw computed pipeline results into a DataFrame for public APIs.

        Parameters
        ----------
        terms : dict[str -> Term]
            Dict mapping column names to terms.
        data : dict[str -> ndarray[ndim=2]]
            Dict mapping column names to computed results for those names.
        mask : ndarray[bool, ndim=2]
            Mask array of values to keep.
        dates : ndarray[datetime64, ndim=1]
            Row index for arrays `data` and `mask`
        assets : ndarray[int64, ndim=2]
            Column index for arrays `data` and `mask`

        Returns
        -------
        results : pd.DataFrame
            The indices of `results` are as follows:

            index : two-tiered MultiIndex of (date, asset).
                Contains an entry for each (date, asset) pair corresponding to
                a `True` value in `mask`.
            columns : Index of str
                One column per entry in `data`.

        If mask[date, asset] is True, then result.loc[(date, asset), colname]
        will contain the value of data[colname][date, asset].
        """
        if not mask.any():
            # Manually handle the empty DataFrame case. This is a workaround
            # to pandas failing to tz_localize an empty dataframe with a
            # MultiIndex. It also saves us the work of applying a known-empty
            # mask to each array.
            #
            # Slicing `dates` here to preserve pandas metadata.
            empty_dates = dates[:0]
            empty_assets = array([], dtype=object)
            return DataFrame(
                data={
                    name: array([], dtype=arr.dtype)
                    for name, arr in iteritems(data)
                },
                index=MultiIndex.from_arrays([empty_dates, empty_assets]),
            )

        resolved_assets = array(self._finder.retrieve_all(assets))
        dates_kept = repeat_last_axis(dates.values, len(assets))[mask]
        assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask]

        final_columns = {}
        for name in data:
            # Each term that computed an output has its postprocess method
            # called on the filtered result.
            #
            # As of Mon May 2 15:38:47 2016, we only use this to convert
            # LabelArrays into categoricals.
            final_columns[name] = terms[name].postprocess(data[name][mask])

        return DataFrame(
            data=final_columns,
            index=MultiIndex.from_arrays([dates_kept, assets_kept]),
        ).tz_localize('UTC', level=0)
Beispiel #55
0
    def test_empty_index_col_scenarios(self):
        data = 'x,y,z'

        # None, no index
        index_col, expected = None, DataFrame([], columns=list('xyz')),
        tm.assert_frame_equal(
            self.read_csv(StringIO(data), index_col=index_col), expected)

        # False, no index
        index_col, expected = False, DataFrame([], columns=list('xyz')),
        tm.assert_frame_equal(
            self.read_csv(StringIO(data), index_col=index_col), expected)

        # int, first column
        index_col, expected = 0, DataFrame([],
                                           columns=['y', 'z'],
                                           index=Index([], name='x'))
        tm.assert_frame_equal(
            self.read_csv(StringIO(data), index_col=index_col), expected)

        # int, not first column
        index_col, expected = 1, DataFrame([],
                                           columns=['x', 'z'],
                                           index=Index([], name='y'))
        tm.assert_frame_equal(
            self.read_csv(StringIO(data), index_col=index_col), expected)

        # str, first column
        index_col, expected = 'x', DataFrame([],
                                             columns=['y', 'z'],
                                             index=Index([], name='x'))
        tm.assert_frame_equal(
            self.read_csv(StringIO(data), index_col=index_col), expected)

        # str, not the first column
        index_col, expected = 'y', DataFrame([],
                                             columns=['x', 'z'],
                                             index=Index([], name='y'))
        tm.assert_frame_equal(
            self.read_csv(StringIO(data), index_col=index_col), expected)

        # list of int
        index_col, expected = [0, 1], DataFrame(
            [],
            columns=['z'],
            index=MultiIndex.from_arrays([[]] * 2, names=['x', 'y']))
        tm.assert_frame_equal(self.read_csv(StringIO(data),
                                            index_col=index_col),
                              expected,
                              check_index_type=False)

        # list of str
        index_col = ['x', 'y']
        expected = DataFrame([],
                             columns=['z'],
                             index=MultiIndex.from_arrays([[]] * 2,
                                                          names=['x', 'y']))
        tm.assert_frame_equal(self.read_csv(StringIO(data),
                                            index_col=index_col),
                              expected,
                              check_index_type=False)

        # list of int, reversed sequence
        index_col = [1, 0]
        expected = DataFrame([],
                             columns=['z'],
                             index=MultiIndex.from_arrays([[]] * 2,
                                                          names=['y', 'x']))
        tm.assert_frame_equal(self.read_csv(StringIO(data),
                                            index_col=index_col),
                              expected,
                              check_index_type=False)

        # list of str, reversed sequence
        index_col = ['y', 'x']
        expected = DataFrame([],
                             columns=['z'],
                             index=MultiIndex.from_arrays([[]] * 2,
                                                          names=['y', 'x']))
        tm.assert_frame_equal(self.read_csv(StringIO(data),
                                            index_col=index_col),
                              expected,
                              check_index_type=False)
Beispiel #56
0
    def test_reset_index_datetime(self, tz_naive_fixture):
        # GH#3950
        tz = tz_naive_fixture
        idx1 = pd.date_range("1/1/2011",
                             periods=5,
                             freq="D",
                             tz=tz,
                             name="idx1")
        idx2 = Index(range(5), name="idx2", dtype="int64")
        idx = MultiIndex.from_arrays([idx1, idx2])
        df = DataFrame(
            {
                "a": np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"]
            },
            index=idx,
        )

        expected = DataFrame(
            {
                "idx1": [
                    datetime(2011, 1, 1),
                    datetime(2011, 1, 2),
                    datetime(2011, 1, 3),
                    datetime(2011, 1, 4),
                    datetime(2011, 1, 5),
                ],
                "idx2":
                np.arange(5, dtype="int64"),
                "a":
                np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"],
            },
            columns=["idx1", "idx2", "a", "b"],
        )
        expected["idx1"] = expected["idx1"].apply(
            lambda d: Timestamp(d, tz=tz))

        tm.assert_frame_equal(df.reset_index(), expected)

        idx3 = pd.date_range("1/1/2012",
                             periods=5,
                             freq="MS",
                             tz="Europe/Paris",
                             name="idx3")
        idx = MultiIndex.from_arrays([idx1, idx2, idx3])
        df = DataFrame(
            {
                "a": np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"]
            },
            index=idx,
        )

        expected = DataFrame(
            {
                "idx1": [
                    datetime(2011, 1, 1),
                    datetime(2011, 1, 2),
                    datetime(2011, 1, 3),
                    datetime(2011, 1, 4),
                    datetime(2011, 1, 5),
                ],
                "idx2":
                np.arange(5, dtype="int64"),
                "idx3": [
                    datetime(2012, 1, 1),
                    datetime(2012, 2, 1),
                    datetime(2012, 3, 1),
                    datetime(2012, 4, 1),
                    datetime(2012, 5, 1),
                ],
                "a":
                np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"],
            },
            columns=["idx1", "idx2", "idx3", "a", "b"],
        )
        expected["idx1"] = expected["idx1"].apply(
            lambda d: Timestamp(d, tz=tz))
        expected["idx3"] = expected["idx3"].apply(
            lambda d: Timestamp(d, tz="Europe/Paris"))
        tm.assert_frame_equal(df.reset_index(), expected)

        # GH#7793
        idx = MultiIndex.from_product([["a", "b"],
                                       pd.date_range("20130101",
                                                     periods=3,
                                                     tz=tz)])
        df = DataFrame(np.arange(6, dtype="int64").reshape(6, 1),
                       columns=["a"],
                       index=idx)

        expected = DataFrame(
            {
                "level_0":
                "a a a b b b".split(),
                "level_1": [
                    datetime(2013, 1, 1),
                    datetime(2013, 1, 2),
                    datetime(2013, 1, 3),
                ] * 2,
                "a":
                np.arange(6, dtype="int64"),
            },
            columns=["level_0", "level_1", "a"],
        )
        expected["level_1"] = expected["level_1"].apply(
            lambda d: Timestamp(d, freq="D", tz=tz))
        result = df.reset_index()
        tm.assert_frame_equal(result, expected)
Beispiel #57
0
    def test_tz_convert_and_localize(self, fn):
        l0 = date_range('20140701', periods=5, freq='D')
        l1 = date_range('20140701', periods=5, freq='D')

        int_idx = Index(range(5))

        if fn == 'tz_convert':
            l0 = l0.tz_localize('UTC')
            l1 = l1.tz_localize('UTC')

        for idx in [l0, l1]:

            l0_expected = getattr(idx, fn)('US/Pacific')
            l1_expected = getattr(idx, fn)('US/Pacific')

            df1 = DataFrame(np.ones(5), index=l0)
            df1 = getattr(df1, fn)('US/Pacific')
            assert_index_equal(df1.index, l0_expected)

            # MultiIndex
            # GH7846
            df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))

            df3 = getattr(df2, fn)('US/Pacific', level=0)
            assert not df3.index.levels[0].equals(l0)
            assert_index_equal(df3.index.levels[0], l0_expected)
            assert_index_equal(df3.index.levels[1], l1)
            assert not df3.index.levels[1].equals(l1_expected)

            df3 = getattr(df2, fn)('US/Pacific', level=1)
            assert_index_equal(df3.index.levels[0], l0)
            assert not df3.index.levels[0].equals(l0_expected)
            assert_index_equal(df3.index.levels[1], l1_expected)
            assert not df3.index.levels[1].equals(l1)

            df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))

            # TODO: untested
            df5 = getattr(df4, fn)('US/Pacific', level=1)  # noqa

            assert_index_equal(df3.index.levels[0], l0)
            assert not df3.index.levels[0].equals(l0_expected)
            assert_index_equal(df3.index.levels[1], l1_expected)
            assert not df3.index.levels[1].equals(l1)

        # Bad Inputs

        # Not DatetimeIndex / PeriodIndex
        with pytest.raises(TypeError, match='DatetimeIndex'):
            df = DataFrame(index=int_idx)
            df = getattr(df, fn)('US/Pacific')

        # Not DatetimeIndex / PeriodIndex
        with pytest.raises(TypeError, match='DatetimeIndex'):
            df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
            df = getattr(df, fn)('US/Pacific', level=0)

        # Invalid level
        with pytest.raises(ValueError, match='not valid'):
            df = DataFrame(index=l0)
            df = getattr(df, fn)('US/Pacific', level=1)
def test_isin_nan_not_pypy():
    idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
    tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]),
                                np.array([False, False]))
    tm.assert_numpy_array_equal(idx.isin([("bar", float("nan"))]),
                                np.array([False, False]))
Beispiel #59
0
    def check_query_with_unnamed_multiindex(self, parser, engine):
        tm.skip_if_no_ne(engine)
        a = np.random.choice(['red', 'green'], size=10)
        b = np.random.choice(['eggs', 'ham'], size=10)
        index = MultiIndex.from_arrays([a, b])
        df = DataFrame(randn(10, 2), index=index)
        ind = Series(df.index.get_level_values(0).values, index=index)

        res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
        res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
        exp = df[ind == 'red']
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # inequality
        res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
        res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
        exp = df[ind != 'red']
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # list equality (really just set membership)
        res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
        res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
        exp = df[ind.isin(['red'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
        res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
        exp = df[~ind.isin(['red'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # in/not in ops
        res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
        res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
        exp = df[ind.isin(['red'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        res1 = df.query('["red"] not in ilevel_0',
                        parser=parser,
                        engine=engine)
        res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
        exp = df[~ind.isin(['red'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # ## LEVEL 1
        ind = Series(df.index.get_level_values(1).values, index=index)
        res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
        res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
        exp = df[ind == 'eggs']
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # inequality
        res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
        res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
        exp = df[ind != 'eggs']
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # list equality (really just set membership)
        res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
        res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
        exp = df[ind.isin(['eggs'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
        res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
        exp = df[~ind.isin(['eggs'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        # in/not in ops
        res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
        res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
        exp = df[ind.isin(['eggs'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)

        res1 = df.query('["eggs"] not in ilevel_1',
                        parser=parser,
                        engine=engine)
        res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
        exp = df[~ind.isin(['eggs'])]
        assert_frame_equal(res1, exp)
        assert_frame_equal(res2, exp)
Beispiel #60
0
def test_from_arrays_invalid_input(invalid_sequence_of_arrays):
    msg = "Input must be a list / sequence of array-likes"
    with pytest.raises(TypeError, match=msg):
        MultiIndex.from_arrays(arrays=invalid_sequence_of_arrays)