Example #1
0
    def test_apply_categorical_data(self):
        # GH 10138
        for ordered in [True, False]:
            dense = Categorical(list("abc"), ordered=ordered)
            # 'b' is in the categories but not in the list
            missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
            values = np.arange(len(dense))
            df = DataFrame({"missing": missing, "dense": dense, "values": values})
            grouped = df.groupby(["missing", "dense"])

            # missing category 'b' should still exist in the output index
            idx = MultiIndex.from_product(
                [Categorical(["a", "b"], ordered=ordered), Categorical(["a", "b", "c"], ordered=ordered)],
                names=["missing", "dense"],
            )
            expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], index=idx, columns=["values"])

            assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
            assert_frame_equal(grouped.mean(), expected)
            assert_frame_equal(grouped.agg(np.mean), expected)

            # but for transform we should still get back the original index
            idx = MultiIndex.from_product([["a"], ["a", "b", "c"]], names=["missing", "dense"])
            expected = Series(1, index=idx)
            assert_series_equal(grouped.apply(lambda x: 1), expected)
Example #2
0
    def _wrap_applied_output(self, keys, values, not_indexed_same=False):
        if len(keys) == 0:
            return Series([])

        key_names = [ping.name for ping in self.groupings]

        if isinstance(values[0], Series):
            if not_indexed_same:
                data_dict = dict(zip(keys, values))
                result = DataFrame(data_dict).T
                if len(self.groupings) > 1:
                    result.index = MultiIndex.from_tuples(keys, names=key_names)
                return result
            else:
                cat_values = np.concatenate([x.values for x in values])
                cat_index = values[0].index
                if len(values) > 1:
                    cat_index = cat_index.append([x.index for x in values[1:]])
                return Series(cat_values, index=cat_index)
        elif isinstance(values[0], DataFrame):
            # possible that Series -> DataFrame by applied function
            return self._wrap_frames(keys, values,
                                     not_indexed_same=not_indexed_same)
        else:
            if len(self.groupings) > 1:
                index = MultiIndex.from_tuples(keys, names=key_names)
                return Series(values, index)
            else:
                return Series(values, keys)
Example #3
0
    def test_apply_categorical_data(self):
        # GH 10138
        for ordered in [True, False]:
            dense = Categorical(list('abc'), ordered=ordered)
            # 'b' is in the categories but not in the list
            missing = Categorical(
                list('aaa'), categories=['a', 'b'], ordered=ordered)
            values = np.arange(len(dense))
            df = DataFrame({'missing': missing,
                            'dense': dense,
                            'values': values})
            grouped = df.groupby(['missing', 'dense'])

            # missing category 'b' should still exist in the output index
            idx = MultiIndex.from_product(
                [Categorical(['a', 'b'], ordered=ordered),
                 Categorical(['a', 'b', 'c'], ordered=ordered)],
                names=['missing', 'dense'])
            expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                                 index=idx,
                                 columns=['values'])

            assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
            assert_frame_equal(grouped.mean(), expected)
            assert_frame_equal(grouped.agg(np.mean), expected)

            # but for transform we should still get back the original index
            idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
                                          names=['missing', 'dense'])
            expected = Series(1, index=idx)
            assert_series_equal(grouped.apply(lambda x: 1), expected)
Example #4
0
    def test_reset_index(self):
        df = tm.makeDataFrame()[:5]
        ser = df.stack()
        ser.index.names = ["hash", "category"]

        ser.name = "value"
        df = ser.reset_index()
        self.assertIn("value", df)

        df = ser.reset_index(name="value2")
        self.assertIn("value2", df)

        # check inplace
        s = ser.reset_index(drop=True)
        s2 = ser
        s2.reset_index(drop=True, inplace=True)
        assert_series_equal(s, s2)

        # level
        index = MultiIndex(
            levels=[["bar"], ["one", "two", "three"], [0, 1]],
            labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
        )
        s = Series(np.random.randn(6), index=index)
        rs = s.reset_index(level=1)
        self.assertEqual(len(rs.columns), 2)

        rs = s.reset_index(level=[0, 2], drop=True)
        self.assertTrue(rs.index.equals(Index(index.get_level_values(1))))
        tm.assertIsInstance(rs, Series)
Example #5
0
    def test_reset_index(self):
        df = tm.makeDataFrame()[:5]
        ser = df.stack()
        ser.index.names = ['hash', 'category']

        ser.name = 'value'
        df = ser.reset_index()
        self.assertIn('value', df)

        df = ser.reset_index(name='value2')
        self.assertIn('value2', df)

        # check inplace
        s = ser.reset_index(drop=True)
        s2 = ser
        s2.reset_index(drop=True, inplace=True)
        assert_series_equal(s, s2)

        # level
        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                                   [0, 1, 0, 1, 0, 1]])
        s = Series(np.random.randn(6), index=index)
        rs = s.reset_index(level=1)
        self.assertEqual(len(rs.columns), 2)

        rs = s.reset_index(level=[0, 2], drop=True)
        self.assert_index_equal(rs.index, Index(index.get_level_values(1)))
        tm.assertIsInstance(rs, Series)
Example #6
0
def _coo_to_sparse_series(A, dense_index=False):
    """ Convert a scipy.sparse.coo_matrix to a SparseSeries.
    Use the defaults given in the SparseSeries constructor. """
    s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    s = s.sort_index()
    s = s.to_sparse()  # TODO: specify kind?
    if dense_index:
        # is there a better constructor method to use here?
        i = range(A.shape[0])
        j = range(A.shape[1])
        ind = MultiIndex.from_product([i, j])
        s = s.reindex_axis(ind)
    return s
Example #7
0
    def test_droplevel_with_names(self):
        index = self.index[self.index.get_loc('foo')]
        dropped = index.droplevel(0)
        self.assertEqual(dropped.name, 'second')

        index = MultiIndex(levels=[Index(range(4)),
                                   Index(range(4)),
                                   Index(range(4))],
                           labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                                   np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                                   np.array([1, 0, 1, 1, 0, 0, 1, 0])],
                           names=['one', 'two', 'three'])
        dropped = index.droplevel(0)
        self.assertEqual(dropped.names, ['two', 'three'])
Example #8
0
    def test_slice_locs_not_sorted(self):
        index = MultiIndex(levels=[Index(range(4)),
                                   Index(range(4)),
                                   Index(range(4))],
                           labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                                   np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                                   np.array([1, 0, 1, 1, 0, 0, 1, 0])])

        self.assertRaises(Exception, index.slice_locs, (1, 0, 1),
                          (2, 1, 0))

        # works
        sorted_index, _ = index.sortlevel(0)
        result = sorted_index.slice_locs((1, 0, 1), (2, 1, 0))
Example #9
0
    def test_get_loc(self):
        self.assert_(self.index.get_loc(('foo', 'two')) == 1)
        self.assert_(self.index.get_loc(('baz', 'two')) == 3)
        self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two'))
        self.assertRaises(KeyError, self.index.get_loc, 'quux')

        # 3 levels
        index = MultiIndex(levels=[Index(range(4)),
                                   Index(range(4)),
                                   Index(range(4))],
                           labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                                   np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                                   np.array([1, 0, 1, 1, 0, 0, 1, 0])])
        self.assertRaises(KeyError, index.get_loc, (1, 1))
        self.assert_(index.get_loc((2, 0)) == slice(3, 5))
Example #10
0
    def test_alignment(self):
        x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]))

        y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)
Example #11
0
 def test_dataframe_insert_column_all_na(self):
     # GH #1534
     mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c')])
     df = DataFrame([[1,2],[3,4],[5,6]], index=mix)
     s = Series({(1,1): 1, (1,2): 2})
     df['new'] = s
     self.assert_(df['new'].isnull().all())
Example #12
0
    def _wrap_result_expand(self, result, expand=False):
        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        from pandas.core.index import Index, MultiIndex
        if not hasattr(result, 'ndim'):
            return result

        if isinstance(self.series, Index):
            name = getattr(result, 'name', None)
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if hasattr(result, 'dtype') and is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                return MultiIndex.from_tuples(result, names=name)
            else:
                return Index(result, name=name)
        else:
            index = self.series.index
            if expand:
                def cons_row(x):
                    if is_list_like(x):
                        return x
                    else:
                        return [ x ]
                cons = self.series._constructor_expanddim
                data = [cons_row(x) for x in result]
                return cons(data, index=index)
            else:
                name = getattr(result, 'name', None)
                cons = self.series._constructor
                return cons(result, name=name, index=index)
Example #13
0
    def _wrap_applied_output(self, keys, values, not_indexed_same=False):
        if len(keys) == 0:
            # XXX
            return DataFrame({})

        key_names = [ping.name for ping in self.groupings]

        if isinstance(values[0], DataFrame):
            return self._wrap_frames(keys, values,
                                     not_indexed_same=not_indexed_same)
        else:
            if len(self.groupings) > 1:
                keys = MultiIndex.from_tuples(keys, names=key_names)

            if isinstance(values[0], np.ndarray):
                if self.axis == 0:
                    stacked_values = np.vstack([np.asarray(x)
                                                for x in values])
                    columns = values[0].index
                    index = keys
                else:
                    stacked_values = np.vstack([np.asarray(x)
                                                for x in values]).T
                    index = values[0].index
                    columns = keys
                return DataFrame(stacked_values, index=index,
                                 columns=columns)
            else:
                return Series(values, index=keys)
Example #14
0
 def test_delevel_infer_dtype(self):
     tuples = [tuple for tuple in cart_product(["foo", "bar"], [10, 20], [1.0, 1.1])]
     index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
     df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index)
     deleveled = df.reset_index()
     self.assert_(com.is_integer_dtype(deleveled["prm1"]))
     self.assert_(com.is_float_dtype(deleveled["prm2"]))
Example #15
0
    def setUp(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp"))

        self.single_level = MultiIndex(levels=[["foo", "bar", "baz", "qux"]], labels=[[0, 1, 2, 3]], names=["first"])

        # create test series object
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype("i8") for lev in self.ymd.index.levels]
        self.ymd.index.names = ["year", "month", "day"]
Example #16
0
    def test_mixed_depth_pop(self):
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        df1 = df.copy()
        df2 = df.copy()
        result = df1.pop("a")
        expected = df2.pop(("a", "", ""))
        assert_series_equal(expected, result)
        assert_frame_equal(df1, df2)
        self.assertEquals(result.name, "a")

        expected = df1["top"]
        df1 = df1.drop(["top"], axis=1)
        result = df2.pop("top")
        assert_frame_equal(expected, result)
        assert_frame_equal(df1, df2)
Example #17
0
    def test_agg_compat(self):

        # GH 12334

        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = MultiIndex.from_tuples([('C', 'sum'),
                                                   ('C', 'std')])
        result = g['D'].agg({'C': ['sum', 'std']})
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = ['C', 'D']
        result = g['D'].agg({'C': 'sum', 'D': 'std'})
        assert_frame_equal(result, expected, check_like=True)
Example #18
0
    def test_mixed_depth_drop(self):
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df.drop('a',axis=1)
        expected = df.drop([('a','','')],axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(['top'],axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        expected = expected.drop([('top','OD','wy')], axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(('top', 'OD', 'wx'), axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        assert_frame_equal(expected, result)

        expected = df.drop([('top','OD','wy')], axis=1)
        expected = df.drop('top', axis=1)

        result = df.drop('result1', level=1, axis=1)
        expected = df.drop([('routine1', 'result1', ''),
                            ('routine2', 'result1', '')], axis=1)
        assert_frame_equal(expected, result)
Example #19
0
def pivot_simple(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : ndarray
        Labels to use to make new frame's index
    columns : ndarray
        Labels to use to make new frame's columns
    values : ndarray
        Values to use for populating new frame's values

    Note
    ----
    Obviously, all 3 of the input arguments must have the same length

    Returns
    -------
    DataFrame
    """
    if (len(index) != len(columns)) or (len(columns) != len(values)):
        raise AssertionError('Length of index, columns, and values must be the'
                             ' same')

    if len(index) == 0:
        return DataFrame(index=[])

    hindex = MultiIndex.from_arrays([index, columns])
    series = Series(values.ravel(), index=hindex)
    series = series.sortlevel(0)
    return series.unstack()
Example #20
0
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals])
    try:
        mindex._verify_integrity()
    except Exception:
        raise Exception("duplicate index/column pairs!")

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
Example #21
0
    def test_constructor(self):
        self.assertTrue(self.ts.index.is_all_dates)

        # Pass in Series
        derived = Series(self.ts)
        self.assertTrue(derived.index.is_all_dates)

        self.assertTrue(tm.equalContents(derived.index, self.ts.index))
        # Ensure new index is not created
        self.assertEqual(id(self.ts.index), id(derived.index))

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        self.assertEqual(mixed.dtype, np.object_)
        self.assertIs(mixed[1], np.NaN)

        self.assertFalse(self.empty.index.is_all_dates)
        self.assertFalse(Series({}).index.is_all_dates)
        self.assertRaises(Exception, Series, np.random.randn(3, 3),
                          index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        self.assertEqual(rs, xp)

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        self.assertRaises(NotImplementedError, Series, m)
Example #22
0
 def _agg_index(self, index, try_parse_dates=True):
     if np.isscalar(self.index_col):
         if try_parse_dates and self._should_parse_dates(self.index_col):
             index = self._conv_date(index)
         na_values = self.na_values
         if isinstance(na_values, dict):
             na_values = _get_na_values(self.index_name, na_values)
         index, na_count = _convert_types(index, na_values)
         index = Index(index, name=self.index_name)
         if self.verbose and na_count:
             print 'Found %d NA values in the index' % na_count
     else:
         arrays = []
         for i, arr in enumerate(index):
             if (try_parse_dates and
                 self._should_parse_dates(self.index_col[i])):
                 arr = self._conv_date(arr)
             col_na_values = self.na_values
             if isinstance(self.na_values, dict):
                 col_name = self.index_name[i]
                 if col_name is not None:
                     col_na_values = _get_na_values(col_name,
                                                    self.na_values)
             arr, _ = _convert_types(arr, col_na_values)
             arrays.append(arr)
         index = MultiIndex.from_arrays(arrays, names=self.index_name)
     return index
Example #23
0
    def test_from_arrays(self):
        arrays = []
        for lev, lab in zip(self.index.levels, self.index.labels):
            arrays.append(np.asarray(lev).take(lab))

        result = MultiIndex.from_arrays(arrays)
        self.assertEquals(list(result), list(self.index))
Example #24
0
    def _wrap_aggregated_output(self, output, mask, comp_ids):
        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        if len(output) == len(agg_labels):
            output_keys = agg_labels
        else:
            output_keys = sorted(output)
            try:
                output_keys.sort()
            except Exception:  # pragma: no cover
                pass

            if isinstance(agg_labels, MultiIndex):
                output_keys = MultiIndex.from_tuples(output_keys,
                                                     names=agg_labels.names)

        if not self.as_index:
            result = DataFrame(output, columns=output_keys)
            group_levels = self._get_group_levels(mask, comp_ids)
            for i, (name, labels) in enumerate(group_levels):
                result.insert(i, name, labels)
            result = result.consolidate()
        else:
            index = self._get_multi_index(mask, comp_ids)
            result = DataFrame(output, index=index, columns=output_keys)

        if self.axis == 1:
            result = result.T

        return result
Example #25
0
    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']
Example #26
0
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals],
                                    names=[index, columns])

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
Example #27
0
    def test_get_loc(self):
        self.assert_(self.index.get_loc(("foo", "two")) == 1)
        self.assert_(self.index.get_loc(("baz", "two")) == 3)
        self.assertRaises(KeyError, self.index.get_loc, ("bar", "two"))
        self.assertRaises(KeyError, self.index.get_loc, "quux")

        # 3 levels
        index = MultiIndex(
            levels=[Index(range(4)), Index(range(4)), Index(range(4))],
            labels=[
                np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                np.array([1, 0, 1, 1, 0, 0, 1, 0]),
            ],
        )
        self.assertRaises(KeyError, index.get_loc, (1, 1))
        self.assert_(index.get_loc((2, 0)) == slice(3, 5))
Example #28
0
    def test_repr_name_coincide(self):
        index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')],
                                       names=['a', 'b', 'c'])

        df = DataFrame({'value': [0, 1]}, index=index)

        lines = repr(df).split('\n')
        self.assert_(lines[2].startswith('a 0 foo'))
Example #29
0
    def test_no_multiindexes(self):
        """
        Only teardown single indexes.
        """
        from pandas.core.index import MultiIndex

        index = MultiIndex.from_tuples([(0,1), (1,2)])
        self.assertRaises(TypeError, teardown_index, index)
Example #30
0
    def test_drop(self):
        dropped = self.index.drop([('foo', 'two'), ('qux', 'one')])

        index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')])
        dropped2 = self.index.drop(index)

        expected = self.index[[0, 2, 3, 5]]
        self.assert_(dropped.equals(expected))
        self.assert_(dropped2.equals(expected))

        dropped = self.index.drop(['bar'])
        expected = self.index[[0, 1, 3, 4, 5]]
        self.assert_(dropped.equals(expected))

        index = MultiIndex.from_tuples([('bar', 'two')])
        self.assertRaises(Exception, self.index.drop, [('bar', 'two')])
        self.assertRaises(Exception, self.index.drop, index)
Example #31
0
    def _reshape_tables(self):
        """
        TODO _reshape_tables should be cleaned !!!
        """
        dfs = self.aggregates_of_dataframe
        dfs_erf = self.aggregates_erfs_dataframe
        labels_variables = self.labels_variables

        agg = Aggregates()

        # We need this for the columns labels to work

        print 'Resetting index to avoid later trouble on manipulation'
        for d in dfs:
            d.reset_index(inplace=True)
            d.set_index('Mesure', inplace=True, drop=False)
            d.reindex_axis(labels_variables, axis=0)
            d.reset_index(inplace=True, drop=True)
#             print d.to_string()
        for d in dfs_erf:
            d.reset_index(inplace=True)
            d['Mesure'] = agg.labels['dep']
            d.set_index('index', inplace=True, drop=False)
            d.reindex_axis(agg.labels.values(), axis=0)
            d.reset_index(inplace=True, drop=True)
#             print d.to_string()

# Concatening the openfisca tables for =/= years

        temp = dfs[0]
        if len(dfs) != 1:
            for d in dfs[1:]:
                temp = pd.concat([temp, d], ignore_index=True)

        del temp[agg.labels['entity']], temp['index']
        gc.collect()

        print 'We split the real aggregates from the of table'
        temp2 = temp[[
            agg.labels['var'], agg.labels['benef_real'],
            agg.labels['dep_real'], 'year'
        ]]
        del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']]
        sauvegarde = temp.columns.get_level_values(0).unique()
        temp['source'] = 'of'
        temp2['source'] = 'reel'
        temp2.rename(columns={
            agg.labels['benef_real']: agg.labels['benef'],
            agg.labels['dep_real']: agg.labels['dep']
        },
                     inplace=True)
        temp = pd.concat([temp, temp2], ignore_index=True)

        print 'We add the erf data to the table'
        for df in dfs_erf:
            del df['level_0'], df['Mesure']
            df.rename(columns={
                'index': agg.labels['var'],
                1: agg.labels['dep']
            },
                      inplace=True)

        temp3 = dfs_erf[0]
        if len(dfs) != 1:
            for d3 in dfs_erf[1:]:
                temp3 = pd.concat([temp3, d3], ignore_index=True)

        temp3['source'] = 'erfs'
        gc.collect()
        temp = pd.concat([temp, temp3], ignore_index=True)
        #         print temp.to_string()

        print 'Index manipulation to reshape the output'
        temp.reset_index(drop=True, inplace=True)
        # We set the new index
        #         temp.set_index('Mesure', drop = True, inplace = True)
        #         temp.set_index('source', drop = True, append = True, inplace = True)
        #         temp.set_index('year', drop = False, append = True, inplace = True)
        temp = temp.groupby(by=["Mesure", "source", "year"], sort=False).sum()
        # Tricky, the [mesure, source, year] index is unique so sum() will return the only value
        # Groupby automatically deleted the source, mesure... columns and added them to index
        assert (isinstance(temp, pd.DataFrame))
        #         print temp.to_string()

        # We want the years to be in columns, so we use unstack
        temp_unstacked = temp.unstack()
        # Unfortunately, unstack automatically sorts rows and columns, we have to reindex the table :

        ## Reindexing rows
        from pandas.core.index import MultiIndex
        indtemp1 = temp.index.get_level_values(0)
        indtemp2 = temp.index.get_level_values(1)
        indexi = zip(*[indtemp1, indtemp2])
        indexi_bis = []
        for i in xrange(0, len(indexi)):
            if indexi[i] not in indexi_bis:
                indexi_bis.append(indexi[i])
        indexi = indexi_bis
        del indexi_bis
        indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source'])
        #         import pdb
        #         pdb.set_trace()
        temp_unstacked = temp_unstacked.reindex_axis(
            indexi, axis=0)  # axis = 0 for rows, 1 for columns

        ## Reindexing columns
        # TODO : still not working
        col_indexi = []
        print temp.columns
        for i in xrange(len(sauvegarde)):
            #         for col in temp.columns.get_level_values(0).unique():
            col = sauvegarde[i]
            for yr in self.years:
                col_indexi.append((col, yr))
        col_indexi = MultiIndex.from_tuples(col_indexi)
        #         print col_indexi
        #         print temp_unstacked.columns
        print col_indexi
        print temp_unstacked.columns
        temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis=1)

        # Our table is ready to be turned to Excel worksheet !
        #         print temp_unstacked.to_string()
        del temp_unstacked['Mesure'], temp_unstacked['year']
        temp_unstacked.fillna(0, inplace=True)
        return temp_unstacked
Example #32
0
def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids,
                                        obs_ids,
                                        shape,
                                        ccodes,
                                        xnull=False)

    if rlocs == []:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name='__placeholder__')
    else:
        dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                                 codes=rcodes + [comp_ids],
                                 names=rnames + ['__placeholder__'],
                                 verify_integrity=False)

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(levels=new_levels,
                             codes=new_codes,
                             names=new_names,
                             verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
Example #33
0
 def _get_multi_index(self, mask, obs_ids):
     masked = [labels for _, labels in
               self._get_group_levels(mask, obs_ids)]
     names = [ping.name for ping in self.groupings]
     return MultiIndex.from_arrays(masked, names=names)
Example #34
0
    def test_equals(self):
        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
        s2 = s1.copy()
        self.assert_(s1.equals(s2))

        s1[1] = 99
        self.assert_(not s1.equals(s2))

        # NaNs compare as equal
        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
        s2 = s1.copy()
        self.assert_(s1.equals(s2))

        s2[0] = 9.9
        self.assert_(not s1.equals(s2))

        idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
        s1 = Series([1, 2, np.nan], index=idx)
        s2 = s1.copy()
        self.assert_(s1.equals(s2))

        # Add object dtype column with nans
        index = np.random.random(10)
        df1 = DataFrame(np.random.random(10, ),
                        index=index,
                        columns=['floats'])
        df1['text'] = 'the sky is so blue. we could use more chocolate.'.split(
        )
        df1['start'] = date_range('2000-1-1', periods=10, freq='T')
        df1['end'] = date_range('2000-1-1', periods=10, freq='D')
        df1['diff'] = df1['end'] - df1['start']
        df1['bool'] = (np.arange(10) % 3 == 0)
        df1.ix[::2] = nan
        df2 = df1.copy()
        self.assert_(df1['text'].equals(df2['text']))
        self.assert_(df1['start'].equals(df2['start']))
        self.assert_(df1['end'].equals(df2['end']))
        self.assert_(df1['diff'].equals(df2['diff']))
        self.assert_(df1['bool'].equals(df2['bool']))
        self.assert_(df1.equals(df2))
        self.assert_(not df1.equals(object))

        # different dtype
        different = df1.copy()
        different['floats'] = different['floats'].astype('float32')
        self.assert_(not df1.equals(different))

        # different index
        different_index = -index
        different = df2.set_index(different_index)
        self.assert_(not df1.equals(different))

        # different columns
        different = df2.copy()
        different.columns = df2.columns[::-1]
        self.assert_(not df1.equals(different))

        # DatetimeIndex
        index = pd.date_range('2000-1-1', periods=10, freq='T')
        df1 = df1.set_index(index)
        df2 = df1.copy()
        self.assert_(df1.equals(df2))

        # MultiIndex
        df3 = df1.set_index(['text'], append=True)
        df2 = df1.set_index(['text'], append=True)
        self.assert_(df3.equals(df2))

        df2 = df1.set_index(['floats'], append=True)
        self.assert_(not df3.equals(df2))

        # NaN in index
        df3 = df1.set_index(['floats'], append=True)
        df2 = df1.set_index(['floats'], append=True)
        self.assert_(df3.equals(df2))
Example #35
0
 def test_rename_mi(self):
     s = Series([11, 21, 31],
                index=MultiIndex.from_tuples([("A", x)
                                              for x in ["a", "B", "c"]]))
     result = s.rename(str.lower)
Example #36
0
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    if ((levels is None and isinstance(keys[0], tuple))
            or (levels is not None and len(levels) > 1)):
        zipped = lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            levels = [
                Categorical.from_array(zp, ordered=True).categories
                for zp in zipped
            ]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key %s not in level %s' %
                                     (str(key), str(level)))

                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Categorical.from_array(concat_index, ordered=True)
            levels.append(factor.categories)
            label_list.append(factor.codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len(set([i.nlevels for i in indexes])) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels,
                          labels=label_list,
                          names=names,
                          verify_integrity=False)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = _ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: %s' %
                             str(hlevel[mask]))

        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels])
    else:
        new_levels.append(new_index)
        new_labels.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels,
                      labels=new_labels,
                      names=new_names,
                      verify_integrity=False)
Example #37
0
def empty(types,
          size,
          cats=None,
          cols=None,
          index_types=None,
          index_names=None,
          timezones=None):
    """
    Create empty DataFrame to assign into

    In the simplest case, will return a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value `views`
    is a dictionary of numpy arrays into which you can assign values that
    show up in the dataframe.

    For categorical columns, you get two views to assign into: if the
    column name is "col", you get both "col" (the category codes) and
    "col-catdef" (the category labels).

    For a single categorical index, you should use the `.set_categories`
    method of the appropriate "-catdef" columns, passing an Index of values

    ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)``

    Multi-indexes work a lot like categoricals, even if the types of each
    index are not themselves categories, and will also have "-catdef" entries
    in the views. However, these will be Dummy instances, providing only a
    ``.set_categories`` method, to be used as above.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string of None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo]}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integers, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, will assume 16-bit integers (a reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    index_types: list of str
        For one of more index columns, make them have this type. See general
        description, above, for caveats about multi-indexing. If None, the
        index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - list of numpy views, in order, of the columns of the dataframe. Assign
        to this.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[six.text_type(col)] = Categorical([],
                                                 categories=cat(col),
                                                 fastpath=True)
        else:
            if hasattr(t, 'base'):
                # funky pandas not-dtype
                t = t.base
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                try:
                    d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
                except:
                    warnings.warn("Inferring time-zone from %s in column %s "
                                  "failed, using time-zone-agnostic"
                                  "" % (timezones[six.text_type(col)], col))
            df[six.text_type(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            index._data._codes = vals
            views[col] = vals
            views[col + '-catdef'] = index._data
        else:
            if hasattr(t, 'base'):
                # funky pandas not-dtype
                t = t.base
            d = np.empty(size, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                try:
                    d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
                except:
                    warnings.warn("Inferring time-zone from %s in column %s "
                                  "failed, using time-zone-agnostic"
                                  "" % (timezones[six.text_type(col)], col))
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        index._codes = list()
        for i, col in enumerate(index_names):
            index._levels.append(Index([None]))

            def set_cats(values, i=i, col=col, **kwargs):
                values.name = col
                if index._levels[i][0] is None:
                    index._levels[i] = values
                elif not index._levels[i].equals(values):
                    raise RuntimeError("Different dictionaries encountered"
                                       " while building categorical")

            x = Dummy()
            x._set_categories = set_cats

            d = np.zeros(size, dtype=int)
            if LooseVersion(pdver) >= LooseVersion("0.24.0"):
                index._codes = list(index._codes) + [d]
            else:
                index._labels.append(d)
            views[col] = d
            views[col + '-catdef'] = x

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code,
                                 categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape, dtype='M8[ns]')
            new_block = block.make_block_same_class(
                type(block.values)(values, dtype=block.values.dtype))
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = np.asarray(block.values, dtype='M8[ns]')
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
Example #38
0
 def test_format_integer_names(self):
     index = MultiIndex(levels=[[0, 1], [0, 1]],
                        labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                        names=[0, 1])
     index.format(names=True)
Example #39
0
 def test_multilevel_consolidate(self):
     index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                     ('bar', 'one'), ('bar', 'two')])
     df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
     df['Totals', ''] = df.sum(1)
     df = df.consolidate()
Example #40
0
    def setUp(self):
        """
        Setup the dataframes used for the groupby tests derived from pandas
        """
        self.dateRange = bdate_range('1/1/2005', periods=250)
        self.stringIndex = Index([rands(8).upper() for x in range(250)])

        self.groupId = Series([x[0] for x in self.stringIndex],
                              index=self.stringIndex)
        self.groupDict = dict(
            (k, v) for k, v in compat.iteritems(self.groupId))

        self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

        randMat = np.random.randn(250, 5)
        self.stringMatrix = DataFrame(randMat,
                                      columns=self.columnIndex,
                                      index=self.stringIndex)

        self.timeMatrix = DataFrame(randMat,
                                    columns=self.columnIndex,
                                    index=self.dateRange)
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })

        self.df_mixed_floats = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.array(np.random.randn(8), dtype='float32')
        })

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3),
                                index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame({
            'A': [
                'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
                'foo', 'foo'
            ],
            'B': [
                'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two',
                'two', 'one'
            ],
            'C': [
                'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                'dull', 'shiny', 'shiny', 'shiny'
            ],
            'D':
            np.random.randn(11),
            'E':
            np.random.randn(11),
            'F':
            np.random.randn(11)
        })
        super(self.__class__, self).setUp()
Example #41
0
    def _wrap_result(self, result, use_codes=True, name=None, expand=None):

        # TODO: this was blindly copied from `strings.StringMethods._wrap_result` for noew
        from pandas.core.index import Index, MultiIndex

        # for category, we do the stuff on the categories, so blow it up
        # to the full series again
        # But for some operations, we have to do the stuff on the full values,
        # so make it possible to skip this step as the method already did this
        # before the transformation...
        # if use_codes and self._is_categorical:
        #    result = take_1d(result, self._orig.cat.codes)

        if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'):
            return result
        assert result.ndim < 3

        if expand is None:
            # infer from ndim if expand is not specified
            expand = False if result.ndim == 1 else True

        elif expand is True and not isinstance(self._orig, Index):
            # required when expand=True is explicitly specified
            # not needed when inferred

            def cons_row(x):
                if is_list_like(x):
                    return x
                else:
                    return [x]

            result = [cons_row(x) for x in result]
            if result:
                # propagate nan values to match longest sequence (GH 18450)
                max_len = max(len(x) for x in result)
                result = [
                    x * max_len if len(x) == 0 or x[0] is np.nan else x
                    for x in result
                ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, 'name', None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, Index):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            if expand:
                cons = self._orig._constructor_expanddim
                return cons(result, columns=name, index=index)
            else:
                # Must be a Series
                cons = self._orig._constructor
                return cons(result, name=name, index=index)
Example #42
0
def _unstack_multiple(data, clocs):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            raise NotImplementedError('Unstacking multiple levels with '
                                      'hierarchical columns not yet supported')

        dummy = DataFrame(data.values, index=dummy_index, columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels,
                             labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
Example #43
0
    def test_tz_convert_and_localize(self):
        l0 = date_range('20140701', periods=5, freq='D')

        # TODO: l1 should be a PeriodIndex for testing
        #       after GH2106 is addressed
        with tm.assertRaises(NotImplementedError):
            period_range('20140701', periods=1).tz_convert('UTC')
        with tm.assertRaises(NotImplementedError):
            period_range('20140701', periods=1).tz_localize('UTC')
        # l1 = period_range('20140701', periods=5, freq='D')
        l1 = date_range('20140701', periods=5, freq='D')

        int_idx = Index(range(5))

        for fn in ['tz_localize', 'tz_convert']:

            if fn == 'tz_convert':
                l0 = l0.tz_localize('UTC')
                l1 = l1.tz_localize('UTC')

            for idx in [l0, l1]:

                l0_expected  = getattr(idx, fn)('US/Pacific')
                l1_expected  = getattr(idx, fn)('US/Pacific')

                df1 = DataFrame(np.ones(5), index=l0)
                df1 = getattr(df1, fn)('US/Pacific')
                self.assertTrue(df1.index.equals(l0_expected))

                # MultiIndex
                # GH7846
                df2 = DataFrame(np.ones(5), 
                                MultiIndex.from_arrays([l0, l1]))

                df3 = getattr(df2, fn)('US/Pacific', level=0)
                self.assertFalse(df3.index.levels[0].equals(l0))
                self.assertTrue(df3.index.levels[0].equals(l0_expected))
                self.assertTrue(df3.index.levels[1].equals(l1))
                self.assertFalse(df3.index.levels[1].equals(l1_expected))

                df3 = getattr(df2, fn)('US/Pacific', level=1)
                self.assertTrue(df3.index.levels[0].equals(l0))
                self.assertFalse(df3.index.levels[0].equals(l0_expected))
                self.assertTrue(df3.index.levels[1].equals(l1_expected))
                self.assertFalse(df3.index.levels[1].equals(l1))

                df4 = DataFrame(np.ones(5),
                                MultiIndex.from_arrays([int_idx, l0]))

                df5 = getattr(df4, fn)('US/Pacific', level=1)
                self.assertTrue(df3.index.levels[0].equals(l0))
                self.assertFalse(df3.index.levels[0].equals(l0_expected))
                self.assertTrue(df3.index.levels[1].equals(l1_expected))
                self.assertFalse(df3.index.levels[1].equals(l1))

        # Bad Inputs
        for fn in ['tz_localize', 'tz_convert']:
            # Not DatetimeIndex / PeriodIndex
            with tm.assertRaisesRegexp(TypeError, 'DatetimeIndex'):
                df = DataFrame(index=int_idx)
                df = getattr(df, fn)('US/Pacific')

            # Not DatetimeIndex / PeriodIndex
            with tm.assertRaisesRegexp(TypeError, 'DatetimeIndex'):
                df = DataFrame(np.ones(5),
                            MultiIndex.from_arrays([int_idx, l0]))
                df = getattr(df, fn)('US/Pacific', level=0)

            # Invalid level
            with tm.assertRaisesRegexp(ValueError, 'not valid'):
                df = DataFrame(index=l0)
                df = getattr(df, fn)('US/Pacific', level=1)
Example #44
0
def empty(types,
          size,
          cats=None,
          cols=None,
          index_types=None,
          index_names=None,
          timezones=None):
    """
    Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string of None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo]}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integers, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, will assume 16-bit integers (a reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - list of numpy views, in order, of the columns of the dataframe. Assign
        to this.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[str(col)] = Categorical([], categories=cat(col), fastpath=True)
        else:
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and str(col) in timezones:
                d = Series(d).dt.tz_localize(timezones[str(col)])
            df[str(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            index._data._codes = vals
            views[col] = vals
            views[col + '-catdef'] = index._data
        else:
            d = np.empty(size, dtype=t)
            # if d.dtype.kind == "M" and str(col) in timezones:
            #     d = Series(d).dt.tz_localize(timezones[str(col)])
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        for i, col in enumerate(index_names):
            if str(index_types[i]) == 'category':
                c = Categorical([], categories=cat(col), fastpath=True)
                z = CategoricalIndex(c)
                z._data._codes = c.categories._data
                z._set_categories = c._set_categories
                index._levels.append(z)

                vals = np.zeros(size, dtype=c.codes.dtype)
                index._labels.append(vals)

                views[col] = index._labels[i]
                views[col + '-catdef'] = index._levels[i]
            else:
                d = np.empty(size, dtype=index_types[i])
                # if d.dtype.kind == "M" and str(col) in timezones:
                #     d = Series(d).dt.tz_localize(timezones[str(col)])
                index._levels.append(Index(d))
                index._labels.append(np.arange(size, dtype=int))
                views[col] = index._levels[i]._data

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code,
                                 categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape, dtype=block.values.values.dtype)
            new_block = block.make_block_same_class(values=values,
                                                    dtype=block.values.dtype)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = block.values.values
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
Example #45
0
    def get_chunk(self, rows=None):
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        if len(content) == 0:  # pragma: no cover
            if self.index_col is not None:
                if np.isscalar(self.index_col):
                    index = Index([], name=self.index_name)
                else:
                    index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                                   names=self.index_name)
            else:
                index = Index([])

            return DataFrame(index=index, columns=self.columns)

        zipped_content = list(lib.to_object_array(content).T)

        if not self._has_complex_date_col and self.index_col is not None:
            index = self._get_simple_index(zipped_content)
            index = self._agg_index(index)
        else:
            index = Index(np.arange(len(content)))

        col_len, zip_len = len(self.columns), len(zipped_content)
        if col_len != zip_len:
            row_num = -1
            for (i, l) in enumerate(content):
                if len(l) != col_len:
                    break

            footers = 0
            if self.skip_footer:
                footers = self.skip_footer
            row_num = self.pos - (len(content) - i + footers)

            msg = ('Expecting %d columns, got %d in row %d' %
                   (col_len, zip_len, row_num))
            raise ValueError(msg)

        data = dict((k, v) for k, v in izip(self.columns, zipped_content))

        # apply converters
        for col, f in self.converters.iteritems():
            if isinstance(col, int) and col not in self.columns:
                col = self.columns[col]
            data[col] = lib.map_infer(data[col], f)

        columns = list(self.columns)
        if self.parse_dates is not None:
            data, columns = self._process_date_conversion(data)

        data = _convert_to_ndarrays(data, self.na_values, self.verbose)

        df = DataFrame(data=data, columns=columns, index=index)
        if self._has_complex_date_col and self.index_col is not None:
            if not self._name_processed:
                self.index_name = self._get_index_name(list(columns))
                self._name_processed = True
            data = dict(((k, v) for k, v in df.iteritems()))
            index = self._get_complex_date_index(data,
                                                 col_names=columns,
                                                 parse_dates=False)
            index = self._agg_index(index, False)
            data = dict(((k, v.values) for k, v in data.iteritems()))
            df = DataFrame(data=data, columns=columns, index=index)

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]]
        return df
Example #46
0
class TestMultiIndex(unittest.TestCase):
    def setUp(self):
        major_axis = Index(['foo', 'bar', 'baz', 'qux'])
        minor_axis = Index(['one', 'two'])

        major_labels = np.array([0, 0, 1, 2, 3, 3])
        minor_labels = np.array([0, 1, 0, 1, 0, 1])

        self.index = MultiIndex(levels=[major_axis, minor_axis],
                                labels=[major_labels, minor_labels],
                                names=['first', 'second'])

    def test_constructor_single_level(self):
        single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                  labels=[[0, 1, 2, 3]],
                                  names=['first'])
        self.assert_(isinstance(single_level, Index))
        self.assert_(not isinstance(single_level, MultiIndex))
        self.assert_(single_level.name == 'first')

        single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                  labels=[[0, 1, 2, 3]])
        self.assert_(single_level.name is None)

    def test_constructor_no_levels(self):
        self.assertRaises(Exception, MultiIndex, levels=[], labels=[])

    def test_duplicate_names(self):
        self.index.names = ['foo', 'foo']
        self.assertRaises(Exception, self.index._get_level_number, 'foo')

    def test_from_arrays(self):
        arrays = []
        for lev, lab in zip(self.index.levels, self.index.labels):
            arrays.append(np.asarray(lev).take(lab))

        result = MultiIndex.from_arrays(arrays)
        self.assertEquals(list(result), list(self.index))

    def test_append(self):
        result = self.index[:3].append(self.index[3:])
        self.assert_(result.equals(self.index))

        foos = [self.index[:1], self.index[1:3], self.index[3:]]
        result = foos[0].append(foos[1:])
        self.assert_(result.equals(self.index))

        # empty
        result = self.index.append([])
        self.assert_(result.equals(self.index))

    def test_get_level_values(self):
        result = self.index.get_level_values(0)
        expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux']
        self.assert_(np.array_equal(result, expected))

        result = self.index.get_level_values('first')
        expected = self.index.get_level_values(0)
        self.assert_(np.array_equal(result, expected))

    def test_nlevels(self):
        self.assertEquals(self.index.nlevels, 2)

    def test_iter(self):
        result = list(self.index)
        expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'),
                    ('baz', 'two'), ('qux', 'one'), ('qux', 'two')]
        self.assert_(result == expected)

    def test_pickle(self):
        pickled = pickle.dumps(self.index)
        unpickled = pickle.loads(pickled)
        self.assert_(self.index.equals(unpickled))

    def test_legacy_pickle(self):
        import os

        def curpath():
            pth, _ = os.path.split(os.path.abspath(__file__))
            return pth

        ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle')
        obj = pickle.load(open(ppath, 'r'))

        self.assert_(obj._is_legacy_format)

        obj2 = MultiIndex.from_tuples(obj.values)
        self.assert_(obj.equals(obj2))

        res = obj.get_indexer(obj2[::-1])
        exp = obj.get_indexer(obj[::-1])
        exp2 = obj2.get_indexer(obj2[::-1])
        assert_almost_equal(res, exp)
        assert_almost_equal(exp, exp2)

    def test_contains(self):
        self.assert_(('foo', 'two') in self.index)
        self.assert_(('bar', 'two') not in self.index)
        self.assert_(None not in self.index)

    def test_is_all_dates(self):
        self.assert_(not self.index.is_all_dates)

    def test_getitem(self):
        # scalar
        self.assertEquals(self.index[2], ('bar', 'one'))

        # slice
        result = self.index[2:5]
        expected = self.index[[2, 3, 4]]
        self.assert_(result.equals(expected))

        # boolean
        result = self.index[[True, False, True, False, True, True]]
        result2 = self.index[np.array([True, False, True, False, True, True])]
        expected = self.index[[0, 2, 4, 5]]
        self.assert_(result.equals(expected))
        self.assert_(result2.equals(expected))

    def test_getitem_group_select(self):
        sorted_idx, _ = self.index.sortlevel(0)
        self.assertEquals(sorted_idx.get_loc('baz'), slice(3, 4))
        self.assertEquals(sorted_idx.get_loc('foo'), slice(0, 2))

    def test_get_loc(self):
        self.assert_(self.index.get_loc(('foo', 'two')) == 1)
        self.assert_(self.index.get_loc(('baz', 'two')) == 3)
        self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two'))
        self.assertRaises(KeyError, self.index.get_loc, 'quux')

        # 3 levels
        index = MultiIndex(
            levels=[Index(range(4)),
                    Index(range(4)),
                    Index(range(4))],
            labels=[
                np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                np.array([1, 0, 1, 1, 0, 0, 1, 0])
            ])
        self.assertRaises(KeyError, index.get_loc, (1, 1))
        self.assert_(index.get_loc((2, 0)) == slice(3, 5))

    def test_get_loc_duplicates(self):
        index = Index([2, 2, 2, 2])
        self.assertRaises(Exception, index.get_loc, 2)

    def test_slice_locs(self):
        df = tm.makeTimeDataFrame()
        stacked = df.stack()

        idx = stacked.index

        slob = slice(*idx.slice_locs(df.index[5], df.index[15]))
        sliced = stacked[slob]
        expected = df[5:16].stack()
        tm.assert_almost_equal(sliced.values, expected.values)

        slob = slice(*idx.slice_locs(df.index[5] +
                                     timedelta(seconds=30), df.index[15] -
                                     timedelta(seconds=30)))
        sliced = stacked[slob]
        expected = df[6:15].stack()
        tm.assert_almost_equal(sliced.values, expected.values)

    def test_slice_locs_not_sorted(self):
        index = MultiIndex(
            levels=[Index(range(4)),
                    Index(range(4)),
                    Index(range(4))],
            labels=[
                np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                np.array([1, 0, 1, 1, 0, 0, 1, 0])
            ])

        self.assertRaises(Exception, index.slice_locs, (1, 0, 1), (2, 1, 0))

        # works
        sorted_index, _ = index.sortlevel(0)
        result = sorted_index.slice_locs((1, 0, 1), (2, 1, 0))

    def test_slice_locs_partial(self):
        sorted_idx, _ = self.index.sortlevel(0)

        result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one'))
        self.assertEquals(result, (1, 5))

        result = sorted_idx.slice_locs(None, ('qux', 'one'))
        self.assertEquals(result, (0, 5))

        result = sorted_idx.slice_locs(('foo', 'two'), None)
        self.assertEquals(result, (1, len(sorted_idx)))

        result = sorted_idx.slice_locs('bar', 'baz')
        self.assertEquals(result, (2, 4))

    def test_slice_locs_not_contained(self):
        # some searchsorted action

        index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]],
                           labels=[[0, 0, 0, 1, 1, 2, 3, 3, 3],
                                   [0, 1, 2, 1, 2, 2, 0, 1, 2]],
                           sortorder=0)

        result = index.slice_locs((1, 0), (5, 2))
        self.assertEquals(result, (3, 6))

        result = index.slice_locs(1, 5)
        self.assertEquals(result, (3, 6))

        result = index.slice_locs((2, 2), (5, 2))
        self.assertEquals(result, (3, 6))

        result = index.slice_locs(2, 5)
        self.assertEquals(result, (3, 6))

        result = index.slice_locs((1, 0), (6, 3))
        self.assertEquals(result, (3, 8))

        result = index.slice_locs(-1, 10)
        self.assertEquals(result, (0, len(index)))

    def test_consistency(self):
        # need to construct an overflow
        major_axis = range(70000)
        minor_axis = range(10)

        major_labels = np.arange(70000)
        minor_labels = np.repeat(range(10), 7000)

        # the fact that is works means it's consistent
        index = MultiIndex(levels=[major_axis, minor_axis],
                           labels=[major_labels, minor_labels])

        # inconsistent
        major_labels = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])
        minor_labels = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])
        index = MultiIndex(levels=[major_axis, minor_axis],
                           labels=[major_labels, minor_labels])

        self.assertRaises(Exception, getattr, index, 'indexMap')

    def test_truncate(self):
        major_axis = Index(range(4))
        minor_axis = Index(range(2))

        major_labels = np.array([0, 0, 1, 2, 3, 3])
        minor_labels = np.array([0, 1, 0, 1, 0, 1])

        index = MultiIndex(levels=[major_axis, minor_axis],
                           labels=[major_labels, minor_labels])

        result = index.truncate(before=1)
        self.assert_('foo' not in result.levels[0])
        self.assert_(1 in result.levels[0])

        result = index.truncate(after=1)
        self.assert_(2 not in result.levels[0])
        self.assert_(1 in result.levels[0])

        result = index.truncate(before=1, after=2)
        self.assertEqual(len(result.levels[0]), 2)

        # after < before
        self.assertRaises(ValueError, index.truncate, 3, 1)

    def test_get_indexer(self):
        major_axis = Index(range(4))
        minor_axis = Index(range(2))

        major_labels = np.array([0, 0, 1, 2, 2, 3, 3])
        minor_labels = np.array([0, 1, 0, 0, 1, 0, 1])

        index = MultiIndex(levels=[major_axis, minor_axis],
                           labels=[major_labels, minor_labels])
        idx1 = index[:5]
        idx2 = index[[1, 3, 5]]

        r1 = idx1.get_indexer(idx2)
        assert_almost_equal(r1, [1, 3, -1])

        r1 = idx2.get_indexer(idx1, method='pad')
        assert_almost_equal(r1, [-1, 0, 0, 1, 1])

        rffill1 = idx2.get_indexer(idx1, method='ffill')
        assert_almost_equal(r1, rffill1)

        r1 = idx2.get_indexer(idx1, method='backfill')
        assert_almost_equal(r1, [0, 0, 1, 1, 2])

        rbfill1 = idx2.get_indexer(idx1, method='bfill')
        assert_almost_equal(r1, rbfill1)

        # pass non-MultiIndex
        r1 = idx1.get_indexer(idx2.get_tuple_index())
        rexp1 = idx1.get_indexer(idx2)
        assert_almost_equal(r1, rexp1)

        # self.assertRaises(Exception, idx1.get_indexer,
        #                   list(list(zip(*idx2.get_tuple_index()))[0]))

    def test_format(self):
        self.index.format()
        self.index[:0].format()

    def test_format_integer_names(self):
        index = MultiIndex(levels=[[0, 1], [0, 1]],
                           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                           names=[0, 1])
        index.format(names=True)

    def test_bounds(self):
        self.index._bounds

    def test_equals(self):
        self.assert_(self.index.equals(self.index))
        self.assert_(self.index.equal_levels(self.index))

        self.assert_(not self.index.equals(self.index[:-1]))

        self.assert_(self.index.equals(self.index.get_tuple_index()))

        # different number of levels
        index = MultiIndex(
            levels=[Index(range(4)),
                    Index(range(4)),
                    Index(range(4))],
            labels=[
                np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                np.array([1, 0, 1, 1, 0, 0, 1, 0])
            ])

        index2 = MultiIndex(levels=index.levels[:-1], labels=index.labels[:-1])
        self.assert_(not index.equals(index2))
        self.assert_(not index.equal_levels(index2))

        # levels are different
        major_axis = Index(range(4))
        minor_axis = Index(range(2))

        major_labels = np.array([0, 0, 1, 2, 2, 3])
        minor_labels = np.array([0, 1, 0, 0, 1, 0])

        index = MultiIndex(levels=[major_axis, minor_axis],
                           labels=[major_labels, minor_labels])
        self.assert_(not self.index.equals(index))
        self.assert_(not self.index.equal_levels(index))

        # some of the labels are different
        major_axis = Index(['foo', 'bar', 'baz', 'qux'])
        minor_axis = Index(['one', 'two'])

        major_labels = np.array([0, 0, 2, 2, 3, 3])
        minor_labels = np.array([0, 1, 0, 1, 0, 1])

        index = MultiIndex(levels=[major_axis, minor_axis],
                           labels=[major_labels, minor_labels])
        self.assert_(not self.index.equals(index))

    def test_union(self):
        piece1 = self.index[:5][::-1]
        piece2 = self.index[3:]

        the_union = piece1 | piece2

        tups = sorted(self.index.get_tuple_index())
        expected = MultiIndex.from_tuples(tups)

        self.assert_(the_union.equals(expected))

        # corner case, pass self or empty thing:
        the_union = self.index.union(self.index)
        self.assert_(the_union is self.index)

        the_union = self.index.union(self.index[:0])
        self.assert_(the_union is self.index)

        # won't work in python 3
        # tuples = self.index.get_tuple_index()
        # result = self.index[:4] | tuples[4:]
        # self.assert_(result.equals(tuples))

    # not valid for python 3
    # def test_union_with_regular_index(self):
    #     other = Index(['A', 'B', 'C'])

    #     result = other.union(self.index)
    #     self.assert_(('foo', 'one') in result)
    #     self.assert_('B' in result)

    #     result2 = self.index.union(other)
    #     self.assert_(result.equals(result2))

    def test_intersection(self):
        piece1 = self.index[:5][::-1]
        piece2 = self.index[3:]

        the_int = piece1 & piece2
        tups = sorted(self.index[3:5].get_tuple_index())
        expected = MultiIndex.from_tuples(tups)
        self.assert_(the_int.equals(expected))

        # corner case, pass self
        the_int = self.index.intersection(self.index)
        self.assert_(the_int is self.index)

        # empty intersection: disjoint
        empty = self.index[:2] & self.index[2:]
        expected = self.index[:0]
        self.assert_(empty.equals(expected))

        # can't do in python 3
        # tuples = self.index.get_tuple_index()
        # result = self.index & tuples
        # self.assert_(result.equals(tuples))

    def test_diff(self):
        first = self.index
        result = first - self.index[-3:]
        expected = MultiIndex.from_tuples(sorted(self.index[:-3].values),
                                          sortorder=0,
                                          names=self.index.names)

        self.assert_(isinstance(result, MultiIndex))
        self.assert_(result.equals(expected))
        self.assertEqual(result.names, self.index.names)

        # empty difference: reflexive
        result = self.index - self.index
        expected = self.index[:0]
        self.assert_(result.equals(expected))
        self.assertEqual(result.names, self.index.names)

        # empty difference: superset
        result = self.index[-3:] - self.index
        expected = self.index[:0]
        self.assert_(result.equals(expected))
        self.assertEqual(result.names, self.index.names)

        # empty difference: degenerate
        result = self.index[:0] - self.index
        expected = self.index[:0]
        self.assert_(result.equals(expected))
        self.assertEqual(result.names, self.index.names)

        # names not the same
        chunklet = self.index[-3:]
        chunklet.names = ['foo', 'baz']
        result = first - chunklet
        self.assertEqual(result.names, [None, None])

        # empty, but non-equal
        result = self.index - self.index.sortlevel(1)[0]
        self.assert_(len(result) == 0)

        # raise Exception called with non-MultiIndex
        self.assertRaises(Exception, first.diff, first.get_tuple_index())

    def test_from_tuples(self):
        self.assertRaises(Exception, MultiIndex.from_tuples, [])

    def test_argsort(self):
        result = self.index.argsort()
        expected = self.index.get_tuple_index().argsort()
        self.assert_(np.array_equal(result, expected))

    def test_sortlevel(self):
        import random

        tuples = list(self.index)
        random.shuffle(tuples)

        index = MultiIndex.from_tuples(tuples)

        sorted_idx, _ = index.sortlevel(0)
        expected = MultiIndex.from_tuples(sorted(tuples))
        self.assert_(sorted_idx.equals(expected))

        sorted_idx, _ = index.sortlevel(0, ascending=False)
        self.assert_(sorted_idx.equals(expected[::-1]))

        sorted_idx, _ = index.sortlevel(1)
        by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
        expected = MultiIndex.from_tuples(by1)
        self.assert_(sorted_idx.equals(expected))

        sorted_idx, _ = index.sortlevel(1, ascending=False)
        self.assert_(sorted_idx.equals(expected[::-1]))

    def test_dims(self):
        pass

    def test_drop(self):
        dropped = self.index.drop([('foo', 'two'), ('qux', 'one')])

        index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')])
        dropped2 = self.index.drop(index)

        expected = self.index[[0, 2, 3, 5]]
        self.assert_(dropped.equals(expected))
        self.assert_(dropped2.equals(expected))

        dropped = self.index.drop(['bar'])
        expected = self.index[[0, 1, 3, 4, 5]]
        self.assert_(dropped.equals(expected))

        index = MultiIndex.from_tuples([('bar', 'two')])
        self.assertRaises(Exception, self.index.drop, [('bar', 'two')])
        self.assertRaises(Exception, self.index.drop, index)

        # mixed partial / full drop
        dropped = self.index.drop(['foo', ('qux', 'one')])
        expected = self.index[[2, 3, 5]]
        self.assert_(dropped.equals(expected))

    def test_droplevel_with_names(self):
        index = self.index[self.index.get_loc('foo')]
        dropped = index.droplevel(0)
        self.assertEqual(dropped.name, 'second')

        index = MultiIndex(
            levels=[Index(range(4)),
                    Index(range(4)),
                    Index(range(4))],
            labels=[
                np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                np.array([1, 0, 1, 1, 0, 0, 1, 0])
            ],
            names=['one', 'two', 'three'])
        dropped = index.droplevel(0)
        self.assertEqual(dropped.names, ['two', 'three'])

    def test_insert(self):
        # key contained in all levels
        new_index = self.index.insert(0, ('bar', 'two'))
        self.assert_(new_index.equal_levels(self.index))
        self.assert_(new_index[0] == ('bar', 'two'))

        # key not contained in all levels
        new_index = self.index.insert(0, ('abc', 'three'))
        self.assert_(
            np.array_equal(new_index.levels[0],
                           list(self.index.levels[0]) + ['abc']))
        self.assert_(
            np.array_equal(new_index.levels[1],
                           list(self.index.levels[1]) + ['three']))
        self.assert_(new_index[0] == ('abc', 'three'))

        # key wrong length
        self.assertRaises(Exception, self.index.insert, 0, ('foo2', ))

    def test_take_preserve_name(self):
        taken = self.index.take([3, 0, 1])
        self.assertEqual(taken.names, self.index.names)

    def test_join_level(self):
        def _check_how(other, how):
            join_index, lidx, ridx = other.join(self.index,
                                                how=how,
                                                level='second',
                                                return_indexers=True)

            exp_level = other.join(self.index.levels[1], how=how)
            self.assert_(join_index.levels[0].equals(self.index.levels[0]))
            self.assert_(join_index.levels[1].equals(exp_level))

            # pare down levels
            mask = np.array([x[1] in exp_level for x in self.index],
                            dtype=bool)
            exp_values = self.index.values[mask]
            self.assert_(np.array_equal(join_index.values, exp_values))

            if how in ('outer', 'inner'):
                join_index2, ridx2, lidx2 = \
                    self.index.join(other, how=how, level='second',
                                    return_indexers=True)

                self.assert_(join_index.equals(join_index2))
                self.assert_(np.array_equal(lidx, lidx2))
                self.assert_(np.array_equal(ridx, ridx2))
                self.assert_(np.array_equal(join_index2.values, exp_values))

        def _check_all(other):
            _check_how(other, 'outer')
            _check_how(other, 'inner')
            _check_how(other, 'left')
            _check_how(other, 'right')

        _check_all(Index(['three', 'one', 'two']))
        _check_all(Index(['one']))
        _check_all(Index(['one', 'three']))

        # some corner cases
        idx = Index(['three', 'one', 'two'])
        result = idx.join(self.index, level='second')
        self.assert_(isinstance(result, MultiIndex))

        self.assertRaises(Exception, self.index.join, self.index, level=1)

    def test_reindex(self):
        result, indexer = self.index.reindex(list(self.index[:4]))
        self.assert_(isinstance(result, MultiIndex))

        result, indexer = self.index.reindex(list(self.index))
        self.assert_(isinstance(result, MultiIndex))
        self.assert_(indexer is None)

    def test_reindex_level(self):
        idx = Index(['one'])

        target, indexer = self.index.reindex(idx, level='second')
        target2, indexer2 = idx.reindex(self.index, idx, level='second')

        exp_index = self.index.join(idx, level='second', how='left')
        self.assert_(target.equals(exp_index))
        self.assert_(target2.equals(exp_index))

    def test_has_duplicates(self):
        self.assert_(not self.index.has_duplicates)
        self.assert_(self.index.append(self.index).has_duplicates)

        index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                           labels=[[0, 0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 0, 1, 2]])
        self.assert_(index.has_duplicates)
Example #47
0
def _stack_multi_columns(frame, level=-1, dropna=True):
    this = frame.copy()

    # this makes life much simpler
    if level != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level, frame.columns.nlevels - 1):
            roll_columns = roll_columns.swaplevel(i, i + 1)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        this = this.sortlevel(0, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(*[
                lev.take(lab) for lev, lab in zip(this.columns.levels[:-1],
                                                  this.columns.labels[:-1])
            ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    levsize = len(level_vals)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)
        slice_len = loc.stop - loc.start
        # can make more efficient?

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.ix[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals).values
        else:
            if frame._is_mixed_type:
                value_slice = this.ix[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns - drop_cols

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(frame.columns.levels[level])
    new_labels.append(np.tile(np.arange(levsize), N))
    new_names.append(frame.columns.names[level])

    new_index = MultiIndex(levels=new_levels,
                           labels=new_labels,
                           names=new_names,
                           verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
Example #48
0
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel:

        We generally want to convert the level number into a level name, except
        when columns do not have names, in which case we must leave as a level
        number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(*[
                lev.take(level_codes) for lev, level_codes in zip(
                    this.columns.levels[:-1], this.columns.codes[:-1])
            ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_codes = sorted(set(this.columns.codes[-1]))
    level_vals_used = level_vals[level_codes]
    levsize = len(level_codes)
    drop_cols = []
    for key in unique_groups:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if (frame._is_homogeneous_type
                    and is_extension_array_dtype(frame.dtypes.iloc[0])):
                dtype = this[this.columns[loc]].dtypes.iloc[0]
                subset = this[this.columns[loc]]

                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.iteritems()])
                N, K = this.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        new_levels = [this.index]
        new_codes = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels,
                           codes=new_codes,
                           names=new_names,
                           verify_integrity=False)

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
Example #49
0
 def test_rename_mi(self):
     df = DataFrame([11, 21, 31],
                    index=MultiIndex.from_tuples([
                        ("A", x) for x in ["a", "B", "c"]
                    ]))
     result = df.rename(str.lower)
Example #50
0
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something
        we can safely pass to swaplevel:

        We generally want to convert the level number into
        a level name, except when columns do not have names,
        in which case we must leave as a level number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sortlevel(level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(*[
                lev.take(lab) for lev, lab in zip(this.columns.levels[:-1],
                                                  this.columns.labels[:-1])
            ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_labels = sorted(set(this.columns.labels[-1]))
    level_vals_used = level_vals[level_labels]
    levsize = len(level_labels)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)
        slice_len = loc.stop - loc.start
        # can make more efficient?

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.ix[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_mixed_type:
                value_slice = this.ix[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(frame.columns.levels[level_num])
    new_labels.append(np.tile(level_labels, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels,
                           labels=new_labels,
                           names=new_names,
                           verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
Example #51
0
def _flex_binary_moment(arg1, arg2, f, pairwise=False):

    if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame))
            and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))):
        raise TypeError("arguments to moment function must be of type "
                        "np.ndarray/Series/DataFrame")

    if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance(
            arg2, (np.ndarray, ABCSeries)):
        X, Y = _prep_binary(arg1, arg2)
        return f(X, Y)

    elif isinstance(arg1, ABCDataFrame):
        from pandas import DataFrame

        def dataframe_from_int_dict(data, frame_template):
            result = DataFrame(data, index=frame_template.index)
            if len(result.columns) > 0:
                result.columns = frame_template.columns[result.columns]
            return result

        results = {}
        if isinstance(arg2, ABCDataFrame):
            if pairwise is False:
                if arg1 is arg2:
                    # special case in order to handle duplicate column names
                    for i, col in enumerate(arg1.columns):
                        results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
                    return dataframe_from_int_dict(results, arg1)
                else:
                    if not arg1.columns.is_unique:
                        raise ValueError("'arg1' columns are not unique")
                    if not arg2.columns.is_unique:
                        raise ValueError("'arg2' columns are not unique")
                    with warnings.catch_warnings(record=True):
                        warnings.simplefilter("ignore", RuntimeWarning)
                        X, Y = arg1.align(arg2, join="outer")
                    X = X + 0 * Y
                    Y = Y + 0 * X

                    with warnings.catch_warnings(record=True):
                        warnings.simplefilter("ignore", RuntimeWarning)
                        res_columns = arg1.columns.union(arg2.columns)
                    for col in res_columns:
                        if col in X and col in Y:
                            results[col] = f(X[col], Y[col])
                    return DataFrame(results,
                                     index=X.index,
                                     columns=res_columns)
            elif pairwise is True:
                results = defaultdict(dict)
                for i, k1 in enumerate(arg1.columns):
                    for j, k2 in enumerate(arg2.columns):
                        if j < i and arg2 is arg1:
                            # Symmetric case
                            results[i][j] = results[j][i]
                        else:
                            results[i][j] = f(
                                *_prep_binary(arg1.iloc[:, i], arg2.iloc[:,
                                                                         j]))

                from pandas import concat

                result_index = arg1.index.union(arg2.index)
                if len(result_index):

                    # construct result frame
                    result = concat(
                        [
                            concat(
                                [
                                    results[i][j]
                                    for j, c in enumerate(arg2.columns)
                                ],
                                ignore_index=True,
                            ) for i, c in enumerate(arg1.columns)
                        ],
                        ignore_index=True,
                        axis=1,
                    )
                    result.columns = arg1.columns

                    # set the index and reorder
                    if arg2.columns.nlevels > 1:
                        result.index = MultiIndex.from_product(
                            arg2.columns.levels + [result_index])
                        result = result.reorder_levels([2, 0, 1]).sort_index()
                    else:
                        result.index = MultiIndex.from_product([
                            range(len(arg2.columns)),
                            range(len(result_index))
                        ])
                        result = result.swaplevel(1, 0).sort_index()
                        result.index = MultiIndex.from_product([result_index] +
                                                               [arg2.columns])
                else:

                    # empty result
                    result = DataFrame(
                        index=MultiIndex(levels=[arg1.index, arg2.columns],
                                         codes=[[], []]),
                        columns=arg2.columns,
                        dtype="float64",
                    )

                # reset our index names to arg1 names
                # reset our column names to arg2 names
                # careful not to mutate the original names
                result.columns = result.columns.set_names(arg1.columns.names)
                result.index = result.index.set_names(result_index.names +
                                                      arg2.columns.names)

                return result

            else:
                raise ValueError("'pairwise' is not True/False")
        else:
            results = {
                i: f(*_prep_binary(arg1.iloc[:, i], arg2))
                for i, col in enumerate(arg1.columns)
            }
            return dataframe_from_int_dict(results, arg1)

    else:
        return _flex_binary_moment(arg2, arg1, f)
Example #52
0
    def to_frame(self, filter_observations=True):
        """
        Transform wide format into long (stacked) format as DataFrame whose
        columns are the Panel's items and whose index is a MultiIndex formed
        of the Panel's major and minor axes.

        Parameters
        ----------
        filter_observations : boolean, default True
            Drop (major, minor) pairs without a complete set of observations
            across all the items

        Returns
        -------
        y : DataFrame
        """
        _, N, K = self.shape

        if filter_observations:
            # shaped like the return DataFrame
            mask = com.notnull(self.values).all(axis=0)
            # size = mask.sum()
            selector = mask.ravel()
        else:
            # size = N * K
            selector = slice(None, None)

        data = {}
        for item in self.items:
            data[item] = self[item].values.ravel()[selector]

        def construct_multi_parts(idx, n_repeat, n_shuffle=1):
            axis_idx = idx.to_hierarchical(n_repeat, n_shuffle)
            labels = [x[selector] for x in axis_idx.labels]
            levels = axis_idx.levels
            names = axis_idx.names
            return labels, levels, names

        def construct_index_parts(idx, major=True):
            levels = [idx]
            if major:
                labels = [np.arange(N).repeat(K)[selector]]
                names = idx.name or 'major'
            else:
                labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
                labels = [labels.ravel()[selector]]
                names = idx.name or 'minor'
            names = [names]
            return labels, levels, names

        if isinstance(self.major_axis, MultiIndex):
            major_labels, major_levels, major_names = construct_multi_parts(
                self.major_axis, n_repeat=K)
        else:
            major_labels, major_levels, major_names = construct_index_parts(
                self.major_axis)

        if isinstance(self.minor_axis, MultiIndex):
            minor_labels, minor_levels, minor_names = construct_multi_parts(
                self.minor_axis, n_repeat=N, n_shuffle=K)
        else:
            minor_labels, minor_levels, minor_names = construct_index_parts(
                self.minor_axis, major=False)

        levels = major_levels + minor_levels
        labels = major_labels + minor_labels
        names = major_names + minor_names

        index = MultiIndex(levels=levels, labels=labels,
                           names=names, verify_integrity=False)

        return DataFrame(data, index=index, columns=self.items)
Example #53
0
def pivot_table(
    data,
    values=None,
    index=None,
    columns=None,
    aggfunc="mean",
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name="All",
    observed=False,
):
    index = _convert_by(index)
    columns = _convert_by(columns)

    if isinstance(aggfunc, list):
        pieces = []
        keys = []
        for func in aggfunc:
            table = pivot_table(
                data,
                values=values,
                index=index,
                columns=columns,
                fill_value=fill_value,
                aggfunc=func,
                margins=margins,
                dropna=dropna,
                margins_name=margins_name,
                observed=observed,
            )
            pieces.append(table)
            keys.append(getattr(func, "__name__", func))

        return concat(pieces, keys=keys, axis=1)

    keys = index + columns

    values_passed = values is not None
    if values_passed:
        if is_list_like(values):
            values_multi = True
            values = list(values)
        else:
            values_multi = False
            values = [values]

        # GH14938 Make sure value labels are in data
        for i in values:
            if i not in data:
                raise KeyError(i)

        to_filter = []
        for x in keys + values:
            if isinstance(x, Grouper):
                x = x.key
            try:
                if x in data:
                    to_filter.append(x)
            except TypeError:
                pass
        if len(to_filter) < len(data.columns):
            data = data[to_filter]

    else:
        values = data.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError, KeyError):
                pass
        values = list(values)

    grouped = data.groupby(keys, observed=observed)
    agged = grouped.agg(aggfunc)
    if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
        agged = agged.dropna(how="all")

        # gh-21133
        # we want to down cast if
        # the original values are ints
        # as we grouped with a NaN value
        # and then dropped, coercing to floats
        for v in values:
            if (v in data and is_integer_dtype(data[v]) and v in agged
                    and not is_integer_dtype(agged[v])):
                agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)

    table = agged
    if table.index.nlevels > 1:
        # Related GH #17123
        # If index_names are integers, determine whether the integers refer
        # to the level position or name.
        index_names = agged.index.names[:len(index)]
        to_unstack = []
        for i in range(len(index), len(keys)):
            name = agged.index.names[i]
            if name is None or name in index_names:
                to_unstack.append(i)
            else:
                to_unstack.append(name)
        table = agged.unstack(to_unstack)

    if not dropna:
        if table.index.nlevels > 1:
            m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
                                       names=table.index.names)
            table = table.reindex(m, axis=0)

        if table.columns.nlevels > 1:
            m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
                                       names=table.columns.names)
            table = table.reindex(m, axis=1)

    if isinstance(table, ABCDataFrame):
        table = table.sort_index(axis=1)

    if fill_value is not None:
        table = table.fillna(value=fill_value, downcast="infer")

    if margins:
        if dropna:
            data = data[data.notna().all(axis=1)]
        table = _add_margins(
            table,
            data,
            values,
            rows=index,
            cols=columns,
            aggfunc=aggfunc,
            observed=dropna,
            margins_name=margins_name,
            fill_value=fill_value,
        )

    # discard the top level
    if (values_passed and not values_multi and not table.empty
            and (table.columns.nlevels > 1)):
        table = table[values[0]]

    if len(index) == 0 and len(columns) > 0:
        table = table.T

    # GH 15193 Make sure empty columns are removed if dropna=True
    if isinstance(table, ABCDataFrame) and dropna:
        table = table.dropna(how="all", axis=1)

    return table
Example #54
0
    def __init__(
        self,
        partitions=None,
        index=None,
        columns=None,
        row_lengths=None,
        column_widths=None,
        dtypes=None,
        op=None,
        index_cols=None,
        uses_rowid=False,
        force_execution_mode=None,
    ):
        assert dtypes is not None

        self.id = str(type(self)._next_id[0])
        type(self)._next_id[0] += 1

        if index is not None:
            index = ensure_index(index)
        columns = ensure_index(columns)
        self._op = op
        self._index_cols = index_cols
        self._partitions = partitions
        self._index_cache = index
        self._columns_cache = columns
        self._row_lengths_cache = row_lengths
        self._column_widths_cache = column_widths
        if self._op is None:
            self._op = FrameNode(self)

        self._table_cols = columns.tolist()
        if self._index_cols is not None:
            self._table_cols = self._index_cols + self._table_cols

        assert len(dtypes) == len(
            self._table_cols
        ), f"unaligned dtypes ({dtypes}) and table columns ({self._table_cols})"
        if isinstance(dtypes, list):
            if self._index_cols is not None:
                # Table stores both index and data columns but those are accessed
                # differently if we have a MultiIndex for columns. To unify access
                # to dtype we extend index column names to tuples to have a MultiIndex
                # of dtypes.
                if isinstance(columns, MultiIndex):
                    tail = [""] * (columns.nlevels - 1)
                    index_tuples = [(col, *tail) for col in self._index_cols]
                    dtype_index = MultiIndex.from_tuples(index_tuples).append(
                        columns)
                    self._dtypes = pd.Series(dtypes, index=dtype_index)
                else:
                    self._dtypes = pd.Series(dtypes, index=self._table_cols)
            else:
                self._dtypes = pd.Series(dtypes, index=columns)
        else:
            self._dtypes = dtypes

        if partitions is not None:
            self._filter_empties()

        # This frame uses encoding for column names to support exotic
        # (e.g. non-string and reserved words) column names. Encoded
        # names are used in OmniSci tables and corresponding Arrow tables.
        # If we import Arrow table, we have to rename its columns for
        # proper processing.
        if self._has_arrow_table() and self._partitions.size > 0:
            assert self._partitions.size == 1
            table = self._partitions[0][0].get()
            if table.column_names[0] != f"F_{self._table_cols[0]}":
                new_names = [f"F_{col}" for col in table.column_names]
                new_table = table.rename_columns(new_names)
                self._partitions[0][
                    0] = self._frame_mgr_cls._partition_class.put_arrow(
                        new_table)

        self._uses_rowid = uses_rowid
        # Tests use forced execution mode to take control over frame
        # execution process. Supported values:
        #  "lazy" - RuntimeError is raised if execution is triggered for the frame
        #  "arrow" - RuntimeError is raised if execution is triggered, but we cannot
        #  execute it using Arrow API (have to use OmniSci for execution)
        self._force_execution_mode = force_execution_mode
Example #55
0
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = _factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(levels=new_levels,
                               codes=new_codes,
                               names=new_names,
                               verify_integrity=False)
    else:
        levels, (ilab,
                 clab) = zip(*map(factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels,
                               codes=codes,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    if frame._is_homogeneous_type:
        # For homogeneous EAs, frame.values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes.values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.iteritems()])
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame.values.ravel()

    else:
        # non-homogeneous
        new_values = frame.values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
Example #56
0
def _make_hierarchical_index(index, names):
    return MultiIndex.from_tuples(*[index], names=names)
Example #57
0
def _unstack_multiple(data, clocs):
    from pandas.core.groupby import decons_obs_group_ids

    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_obs_group_ids(comp_ids,
                                         obs_ids,
                                         shape,
                                         clabels,
                                         xnull=False)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'],
                             verify_integrity=False)

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [val if i > val else val - 1 for val in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index, columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels,
                             labels=new_labels,
                             names=new_names,
                             verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
Example #58
0
def pivot_table(data,
                values=None,
                index=None,
                columns=None,
                aggfunc='mean',
                fill_value=None,
                margins=False,
                dropna=True):
    """
    Create a spreadsheet-style pivot table as a DataFrame. The levels in the
    pivot table will be stored in MultiIndex objects (hierarchical indexes) on
    the index and columns of the result DataFrame

    Parameters
    ----------
    data : DataFrame
    values : column to aggregate, optional
    index : a column, Grouper, array which has the same length as data, or list of them.
        Keys to group by on the pivot table index.
        If an array is passed, it is being used as the same manner as column values.
    columns : a column, Grouper, array which has the same length as data, or list of them.
        Keys to group by on the pivot table column.
        If an array is passed, it is being used as the same manner as column values.
    aggfunc : function, default numpy.mean, or list of functions
        If list of functions passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names (inferred
        from the function objects themselves)
    fill_value : scalar, default None
        Value to replace missing values with
    margins : boolean, default False
        Add all row / columns (e.g. for subtotal / grand totals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    rows : kwarg only alias of index [deprecated]
    cols : kwarg only alias of columns [deprecated]

    Examples
    --------
    >>> df
       A   B   C      D
    0  foo one small  1
    1  foo one large  2
    2  foo one large  2
    3  foo two small  3
    4  foo two small  3
    5  bar one large  4
    6  bar one small  5
    7  bar two small  6
    8  bar two large  7

    >>> table = pivot_table(df, values='D', index=['A', 'B'],
    ...                     columns=['C'], aggfunc=np.sum)
    >>> table
              small  large
    foo  one  1      4
         two  6      NaN
    bar  one  5      4
         two  6      7

    Returns
    -------
    table : DataFrame
    """
    index = _convert_by(index)
    columns = _convert_by(columns)

    if isinstance(aggfunc, list):
        pieces = []
        keys = []
        for func in aggfunc:
            table = pivot_table(data,
                                values=values,
                                index=index,
                                columns=columns,
                                fill_value=fill_value,
                                aggfunc=func,
                                margins=margins)
            pieces.append(table)
            keys.append(func.__name__)
        return concat(pieces, keys=keys, axis=1)

    keys = index + columns

    values_passed = values is not None
    if values_passed:
        if isinstance(values, (list, tuple)):
            values_multi = True
        else:
            values_multi = False
            values = [values]
    else:
        values = list(data.columns.drop(keys))

    if values_passed:
        to_filter = []
        for x in keys + values:
            if isinstance(x, Grouper):
                x = x.key
            try:
                if x in data:
                    to_filter.append(x)
            except TypeError:
                pass
        if len(to_filter) < len(data.columns):
            data = data[to_filter]

    grouped = data.groupby(keys)
    agged = grouped.agg(aggfunc)

    table = agged
    if table.index.nlevels > 1:
        to_unstack = [
            agged.index.names[i] or i for i in range(len(index), len(keys))
        ]
        table = agged.unstack(to_unstack)

    if not dropna:
        try:
            m = MultiIndex.from_arrays(cartesian_product(table.index.levels))
            table = table.reindex_axis(m, axis=0)
        except AttributeError:
            pass  # it's a single level

        try:
            m = MultiIndex.from_arrays(cartesian_product(table.columns.levels))
            table = table.reindex_axis(m, axis=1)
        except AttributeError:
            pass  # it's a single level or a series

    if isinstance(table, DataFrame):
        if isinstance(table.columns, MultiIndex):
            table = table.sortlevel(axis=1)
        else:
            table = table.sort_index(axis=1)

    if fill_value is not None:
        table = table.fillna(value=fill_value, downcast='infer')

    if margins:
        table = _add_margins(table,
                             data,
                             values,
                             rows=index,
                             cols=columns,
                             aggfunc=aggfunc)

    # discard the top level
    if values_passed and not values_multi:
        table = table[values[0]]

    if len(index) == 0 and len(columns) > 0:
        table = table.T

    return table
Example #59
0
    def get_chunk(self, rows=None):
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        if len(content) == 0: # pragma: no cover
            if self.index_col is not None:
                if np.isscalar(self.index_col):
                    index = Index([], name=self.index_name)
                else:
                    index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                                   names=self.index_name)
            else:
                index = Index([])

            return DataFrame(index=index, columns=self.columns)

        zipped_content = list(lib.to_object_array(content).T)

        # no index column specified, so infer that's what is wanted
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = zipped_content.pop(self.index_col)
            else: # given a list of index
                index = []
                for idx in self.index_col:
                    index.append(zipped_content[idx])
                # remove index items from content and columns, don't pop in loop
                for i in reversed(sorted(self.index_col)):
                    zipped_content.pop(i)

            if np.isscalar(self.index_col):
                if self.parse_dates:
                    index = lib.try_parse_dates(index, parser=self.date_parser)
                index, na_count = _convert_types(index, self.na_values)
                index = Index(index, name=self.index_name)
                if self.verbose and na_count:
                    print 'Found %d NA values in the index' % na_count
            else:
                arrays = []
                for arr in index:
                    if self.parse_dates:
                        arr = lib.try_parse_dates(arr, parser=self.date_parser)
                    arr, _ = _convert_types(arr, self.na_values)
                    arrays.append(arr)
                index = MultiIndex.from_arrays(arrays, names=self.index_name)
        else:
            index = Index(np.arange(len(content)))

        if not index._verify_integrity():
            dups = index.get_duplicates()
            err_msg = 'Tried columns 1-X as index but found duplicates %s'
            raise Exception(err_msg % str(dups))

        if len(self.columns) != len(zipped_content):
            raise Exception('wrong number of columns')

        data = dict((k, v) for k, v in izip(self.columns, zipped_content))

        # apply converters
        for col, f in self.converters.iteritems():
            if isinstance(col, int) and col not in self.columns:
                col = self.columns[col]
            data[col] = lib.map_infer(data[col], f)

        data = _convert_to_ndarrays(data, self.na_values, self.verbose)

        return DataFrame(data=data, columns=self.columns, index=index)
Example #60
0
File: merge.py Project: lahi/pandas
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    if ((levels is None and isinstance(keys[0], tuple))
            or (levels is not None and len(levels) > 1)):
        zipped = zip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            levels = [Factor(zp).levels for zp in zipped]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                i = level.get_loc(key)
                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Factor(concat_index)
            levels.append(factor.levels)
            label_list.append(factor.labels)

        # also copies
        names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, labels=label_list, names=names)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        mapped = level.get_indexer(hlevel)
        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels])
        new_names.extend(new_index.names)
    else:
        new_levels.append(new_index)
        new_names.append(new_index.name)
        new_labels.append(np.tile(np.arange(n), kpieces))

    return MultiIndex(levels=new_levels, labels=new_labels, names=new_names)