Example #1
    def ols_results(self):
        """
        Returns the results of the regressions:
        x_1 ~ L(X)
        x_2 ~ L(X)
        ...
        x_k ~ L(X)

        where X = [x_1, x_2, ..., x_k]
        and L(X) represents the columns of X lagged 1, 2, ..., n lags
        (n is the user-provided number of lags).

        Returns
        -------
        dict
        """
        from pandas.stats.api import ols

        d = {}
        for i in range(1, 1 + self._p):
            for col, series in compat.iteritems(self._lagged_data[i]):
                d[_make_param_name(i, col)] = series

        result = dict([(col, ols(y=y, x=d, intercept=self._intercept))
                       for col, y in compat.iteritems(self._data)])

        return result
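A minimal sketch of the lagged-regressor dict assembled above, assuming _make_param_name(i, col) yields labels like 'L1.x1' (the helper is not shown here, so the exact format is an assumption); pandas.stats.api.ols itself was removed from pandas long ago:

import numpy as np
import pandas as pd

# Toy stand-in for self._lagged_data: each column of X shifted by 1..p lags.
X = pd.DataFrame({'x1': np.arange(5.0), 'x2': np.arange(5.0) ** 2})
p = 2
d = {'L{i}.{col}'.format(i=i, col=col): X[col].shift(i)
     for i in range(1, p + 1) for col in X.columns}
print(sorted(d))  # ['L1.x1', 'L1.x2', 'L2.x1', 'L2.x2']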
Example #2
def stack_sparse_frame(frame):
    """
    Only makes sense when fill_value is NaN
    """
    lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)]
    nobs = sum(lengths)

    # this is pretty fast
    minor_labels = np.repeat(np.arange(len(frame.columns)), lengths)

    inds_to_concat = []
    vals_to_concat = []
    # TODO: Figure out whether this can be reached.
    # I think this currently can't be reached because you can't build a SparseDataFrame
    # with a non-np.NaN fill value (fails earlier).
    for _, series in compat.iteritems(frame):
        if not np.isnan(series.fill_value):
            raise TypeError('This routine assumes NaN fill value')

        int_index = series.sp_index.to_int_index()
        inds_to_concat.append(int_index.indices)
        vals_to_concat.append(series.sp_values)

    major_labels = np.concatenate(inds_to_concat)
    stacked_values = np.concatenate(vals_to_concat)
    index = MultiIndex(levels=[frame.index, frame.columns],
                       labels=[major_labels, minor_labels],
                       verify_integrity=False)

    lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
                   columns=['foo'])
    return lp.sortlevel(level=0)
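A standalone NumPy illustration of the minor-label construction above, assuming two columns that store 2 and 3 non-NaN points respectively:

import numpy as np

lengths = [2, 3]  # sp_index.npoints for each column
minor_labels = np.repeat(np.arange(len(lengths)), lengths)
print(minor_labels)  # [0 0 1 1 1] -> column position of each stacked value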
Example #3
    def test_replace_input_formats_listlike(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(to_rep, values)
        expected = {k: v.replace(to_rep[k], values[k])
                    for k, v in compat.iteritems(df)}
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                              'C': ['', 'asdf', 'fd']})
        assert_frame_equal(result, expected)

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(np.nan, values)
        expected = {k: v.replace(np.nan, values[k])
                    for k, v in compat.iteritems(df)}
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        pytest.raises(ValueError, df.replace, to_rep, values[1:])
Example #4
    def test_reindex(self):
        newFrame = self.frame.reindex(self.ts1.index)

        for col in newFrame.columns:
            for idx, val in compat.iteritems(newFrame[col]):
                if idx in self.frame.index:
                    if np.isnan(val):
                        self.assertTrue(np.isnan(self.frame[col][idx]))
                    else:
                        self.assertEqual(val, self.frame[col][idx])
                else:
                    self.assertTrue(np.isnan(val))

        for col, series in compat.iteritems(newFrame):
            self.assertTrue(tm.equalContents(series.index, newFrame.index))
        emptyFrame = self.frame.reindex(Index([]))
        self.assertEqual(len(emptyFrame.index), 0)

        # Cython code should be unit-tested directly
        nonContigFrame = self.frame.reindex(self.ts1.index[::2])

        for col in nonContigFrame.columns:
            for idx, val in compat.iteritems(nonContigFrame[col]):
                if idx in self.frame.index:
                    if np.isnan(val):
                        self.assertTrue(np.isnan(self.frame[col][idx]))
                    else:
                        self.assertEqual(val, self.frame[col][idx])
                else:
                    self.assertTrue(np.isnan(val))

        for col, series in compat.iteritems(nonContigFrame):
            self.assertTrue(tm.equalContents(series.index,
                                             nonContigFrame.index))

        # corner cases

        # Same index, copies values but not index if copy=False
        newFrame = self.frame.reindex(self.frame.index, copy=False)
        self.assertIs(newFrame.index, self.frame.index)

        # length zero
        newFrame = self.frame.reindex([])
        self.assertTrue(newFrame.empty)
        self.assertEqual(len(newFrame.columns), len(self.frame.columns))

        # length zero with columns reindexed with non-empty index
        newFrame = self.frame.reindex([])
        newFrame = newFrame.reindex(self.frame.index)
        self.assertEqual(len(newFrame.index), len(self.frame.index))
        self.assertEqual(len(newFrame.columns), len(self.frame.columns))

        # pass non-Index
        newFrame = self.frame.reindex(list(self.ts1.index))
        self.assert_index_equal(newFrame.index, self.ts1.index)

        # copy with no axes
        result = self.frame.reindex()
        assert_frame_equal(result, self.frame)
        self.assertFalse(result is self.frame)
Example #5
def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
    from pandas.core.panel import _get_combined_index
    output = {}
    for item, df in compat.iteritems(frames):
        if not isinstance(df, SparseDataFrame):
            df = SparseDataFrame(df, default_kind=kind,
                                 default_fill_value=fill_value)

        output[item] = df

    if index is None:
        all_indexes = [df.index for df in output.values()]
        index = _get_combined_index(all_indexes)
    if columns is None:
        all_columns = [df.columns for df in output.values()]
        columns = _get_combined_index(all_columns)

    index = _ensure_index(index)
    columns = _ensure_index(columns)

    for item, df in compat.iteritems(output):
        if not (df.index.equals(index) and df.columns.equals(columns)):
            output[item] = df.reindex(index=index, columns=columns)

    return output, index, columns
Example #6
    def _update_method_mapper(cls, mapper):
        """Attach cls._method_mapper to passed mapper"""
        if cls._method_mapper is None:
            return mapper
        for key, class_dict in compat.iteritems(cls._method_mapper):
            # mapping method_name to actual class method
            class_dict = {k: getattr(cls, m)
                          for k, m in compat.iteritems(class_dict)}
            mapper[key] = dict(mapper[key], **class_dict)
        return mapper
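A toy illustration of the mapping step, using a hypothetical class (not from pandas) to show how method-name strings are resolved to class attributes:

class Demo(object):
    _method_mapper = {'aggregate': {'avg': 'mean'}}

    def mean(self):
        return 42

mapper = {'aggregate': {}}
for key, class_dict in Demo._method_mapper.items():
    # same resolution step as above: name string -> class attribute
    bound = {k: getattr(Demo, m) for k, m in class_dict.items()}
    mapper[key] = dict(mapper[key], **bound)
print(mapper['aggregate']['avg'](Demo()))  # 42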
Example #7
    def test_iteritems(self):
        for idx, val in compat.iteritems(self.series):
            assert val == self.series[idx]

        for idx, val in compat.iteritems(self.ts):
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.iteritems(), 'reverse')
Example #8
    def test_iteritems(self):
        for idx, val in compat.iteritems(self.series):
            self.assertEqual(val, self.series[idx])

        for idx, val in compat.iteritems(self.ts):
            self.assertEqual(val, self.ts[idx])

        # assert is lazy (generators don't define reverse, lists do)
        self.assertFalse(hasattr(self.series.iteritems(), 'reverse'))
Example #9
def int_frame():
    """
    Fixture for DataFrame of ints with index of unique strings

    Columns are ['A', 'B', 'C', 'D']
    """
    df = DataFrame({k: v.astype(int)
                   for k, v in compat.iteritems(tm.getSeriesData())})
    # force these all to int64 to avoid platform testing issues
    return DataFrame({c: s for c, s in compat.iteritems(df)}, dtype=np.int64)
Example #10
    def test_apply_differently_indexed(self):
        df = DataFrame(np.random.randn(20, 10))

        result0 = df.apply(Series.describe, axis=0)
        expected0 = DataFrame(dict((i, v.describe())
                                   for i, v in compat.iteritems(df)),
                              columns=df.columns)
        assert_frame_equal(result0, expected0)

        result1 = df.apply(Series.describe, axis=1)
        expected1 = DataFrame(dict((i, v.describe())
                                   for i, v in compat.iteritems(df.T)),
                              columns=df.index).T
        assert_frame_equal(result1, expected1)
Example #11
    def test_to_dict(self, mapping):
        test_data = {
            'A': {'1': 1, '2': 2},
            'B': {'1': '1', '2': '2', '3': '3'},
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][int(k2) - 1])

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
                          'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [{'A': 1.0, 'B': '1'},
                            {'A': 2.0, 'B': '2'},
                            {'A': np.nan, 'B': '3'}]
        assert isinstance(recons_data, list)
        assert (len(recons_data) == 3)
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k2][k])

        df = DataFrame(test_data)
        df['duped'] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data['duped'] = comp_data[df.columns[0]]
        for k, v in compat.iteritems(comp_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k2][k])
Example #12
    def most_common(self, n=None):
        """List the n most common elements and their counts from the most
        common to the least.  If n is None, then list all element counts.

        >>> Counter('abcdeabcdabcaba').most_common(3)
        [('a', 5), ('b', 4), ('c', 3)]

        """
        # Emulate Bag.sortedByCount from Smalltalk
        if n is None:
            return sorted(compat.iteritems(self), key=_itemgetter(1), reverse=True)
        return _heapq.nlargest(n, compat.iteritems(self), key=_itemgetter(1))
Example #13
def percentileRank(frame, column=None, kind='mean'):
    """
    Return score at percentile for each point in time (cross-section)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score.  In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function.  A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used in
                  testing.  See

                  http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    TimeSeries or DataFrame, depending on input
    """
    from pandas.compat.scipy import percentileofscore
    fun = lambda xs, score: percentileofscore(remove_na(xs),
                                              score, kind=kind)

    results = {}
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            for date, xs in compat.iteritems(framet):
                results[date] = fun(xs, column.get(date, NaN))
        else:
            for date, xs in compat.iteritems(framet):
                results[date] = fun(xs, xs[column])
        results = Series(results)
    else:
        for column in frame.columns:
            for date, xs in compat.iteritems(framet):
                results.setdefault(date, {})[column] = fun(xs, xs[column])
        results = DataFrame(results).T
    return results
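The kind options map directly onto scipy's percentileofscore, which the snippet pulls in via a compat shim; a quick standalone check (assuming scipy is installed):

from scipy.stats import percentileofscore

xs = [1, 2, 3, 3, 4]
print(percentileofscore(xs, 3, kind='weak'))    # 80.0 -> 4 of 5 values <= 3
print(percentileofscore(xs, 3, kind='strict'))  # 40.0 -> 2 of 5 values < 3
print(percentileofscore(xs, 3, kind='mean'))    # 60.0 -> average of the two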
Example #14
    def test_map(self):
        index, data = tm.getMixedTypeDict()

        source = Series(data['B'], index=data['C'])
        target = Series(data['C'][:4], index=data['D'][:4])

        merged = target.map(source)

        for k, v in compat.iteritems(merged):
            assert v == source[target[k]]

        # input could be a dict
        merged = target.map(source.to_dict())

        for k, v in compat.iteritems(merged):
            assert v == source[target[k]]

        # function
        result = self.ts.map(lambda x: x * 2)
        tm.assert_series_equal(result, self.ts * 2)

        # GH 10324
        a = Series([1, 2, 3, 4])
        b = Series(["even", "odd", "even", "odd"], dtype="category")
        c = Series(["even", "odd", "even", "odd"])

        exp = Series(["odd", "even", "odd", np.nan], dtype="category")
        tm.assert_series_equal(a.map(b), exp)
        exp = Series(["odd", "even", "odd", np.nan])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(['a', 'b', 'c', 'd'])
        b = Series([1, 2, 3, 4],
                   index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
        c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e']))

        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(['a', 'b', 'c', 'd'])
        b = Series(['B', 'C', 'D', 'E'], dtype='category',
                   index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
        c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e']))

        exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'],
                                    categories=['B', 'C', 'D', 'E']))
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, 'B', 'C', 'D'])
        tm.assert_series_equal(a.map(c), exp)
Example #15
    def _init_dict(self, data, index, columns, dtype=None):
        # pre-filter out columns if we passed it
        if columns is not None:
            columns = ensure_index(columns)
            data = {k: v for k, v in compat.iteritems(data) if k in columns}
        else:
            keys = com.dict_keys_to_ordered_list(data)
            columns = Index(keys)

        if index is None:
            index = extract_index(list(data.values()))

        def sp_maker(x):
            return SparseArray(x, kind=self._default_kind,
                               fill_value=self._default_fill_value,
                               copy=True, dtype=dtype)
        sdict = {}
        for k, v in compat.iteritems(data):
            if isinstance(v, Series):
                # Force alignment, no copy necessary
                if not v.index.equals(index):
                    v = v.reindex(index)

                if not isinstance(v, SparseSeries):
                    v = sp_maker(v.values)
            elif isinstance(v, SparseArray):
                v = v.copy()
            else:
                if isinstance(v, dict):
                    v = [v.get(i, np.nan) for i in index]

                v = sp_maker(v)

            if index is not None and len(v) != len(index):
                msg = "Length of passed values is {}, index implies {}"
                raise ValueError(msg.format(len(v), len(index)))
            sdict[k] = v

        if len(columns.difference(sdict)):
            # TODO: figure out how to handle this case, all nan's?
            # add in any other columns we want to have (completeness)
            nan_arr = np.empty(len(index), dtype='float64')
            nan_arr.fill(np.nan)
            nan_arr = SparseArray(nan_arr, kind=self._default_kind,
                                  fill_value=self._default_fill_value,
                                  copy=False)
            sdict.update((c, nan_arr) for c in columns if c not in sdict)

        return to_manager(sdict, columns, index)
Example #16
    def test_groups(self, df):
        grouped = df.groupby(['A'])
        groups = grouped.groups
        assert groups is grouped.groups  # caching works

        for k, v in compat.iteritems(grouped.groups):
            assert (df.loc[v]['A'] == k).all()

        grouped = df.groupby(['A', 'B'])
        groups = grouped.groups
        assert groups is grouped.groups  # caching works

        for k, v in compat.iteritems(grouped.groups):
            assert (df.loc[v]['A'] == k[0]).all()
            assert (df.loc[v]['B'] == k[1]).all()
Example #17
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in compat.iteritems(self.kwargs):
                    if k not in kwds:
                        kwds[k] = v
            try:
                if self.zero_value is not None and values.size == 0:
                    if values.ndim == 1:
                        return 0
                    else:
                        result_shape = (values.shape[:axis] +
                                        values.shape[axis + 1:])
                        result = np.empty(result_shape)
                        result.fill(0)
                        return result

                if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            except Exception:
                result = alt(values, axis=axis, skipna=skipna, **kwds)

            return result
Example #18
    def apply(self, func, axis=0, broadcast=False):
        """
        Analogous to DataFrame.apply, for SparseDataFrame

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : {0, 1}
        broadcast : bool, default False
            For aggregation functions, return object of same size with values
            propagated

        Returns
        -------
        applied : Series or SparseDataFrame
        """
        if not len(self.columns):
            return self

        if isinstance(func, np.ufunc):
            new_series = {}
            for k, v in compat.iteritems(self):
                applied = func(v)
                applied.fill_value = func(applied.fill_value)
                new_series[k] = applied
            return SparseDataFrame(new_series, index=self.index,
                                   columns=self.columns,
                                   default_fill_value=self.default_fill_value,
                                   default_kind=self.default_kind)
        else:
            if not broadcast:
                return self._apply_standard(func, axis)
            else:
                return self._apply_broadcast(func, axis)
Example #19
    def test_delete_slice(self):
        idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx')

        # preserve freq
        expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D',
                                       name='idx')
        expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D',
                                       name='idx')

        # reset freq to None
        expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d',
                                       '7 d', '8 d', '9 d', '10d'],
                                      freq=None, name='idx')

        cases = {(0, 1, 2): expected_0_2,
                 (7, 8, 9): expected_7_9,
                 (3, 4, 5): expected_3_5}
        for n, expected in compat.iteritems(cases):
            result = idx.delete(n)
            tm.assert_index_equal(result, expected)
            assert result.name == expected.name
            assert result.freq == expected.freq

            result = idx.delete(slice(n[0], n[-1] + 1))
            tm.assert_index_equal(result, expected)
            assert result.name == expected.name
            assert result.freq == expected.freq
Example #20
    def _combine_match_index(self, other, func, level=None, fill_value=None):
        new_data = {}

        if fill_value is not None:
            raise NotImplementedError
        if level is not None:
            raise NotImplementedError

        new_index = self.index.union(other.index)
        this = self
        if self.index is not new_index:
            this = self.reindex(new_index)

        if other.index is not new_index:
            other = other.reindex(new_index)

        for col, series in compat.iteritems(this):
            new_data[col] = func(series.values, other.values)

        # fill_value is a function of our operator
        if isnull(other.fill_value) or isnull(self.default_fill_value):
            fill_value = np.nan
        else:
            fill_value = func(np.float64(self.default_fill_value),
                              np.float64(other.fill_value))

        return self._constructor(new_data,
                                 index=new_index,
                                 columns=self.columns,
                                 default_fill_value=fill_value,
                                 fill_value=self.default_fill_value).__finalize__(self)
Example #21
    def _combine_const(self, other, func):
        new_data = {}
        for col, series in compat.iteritems(self):
            new_data[col] = func(series, other)

        return self._constructor(data=new_data, index=self.index,
                                 columns=self.columns)
Example #22
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None, takeable=False):
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return SparseDataFrame(index=index, columns=self.columns)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = com._ensure_platform_int(indexer)
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in compat.iteritems(self):
            values = series.values
            new = values.take(indexer)

            if need_mask:
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return SparseDataFrame(new_series, index=index, columns=self.columns,
                               default_fill_value=self.default_fill_value)
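The take-then-mask core of the loop above, as a standalone NumPy sketch; the indexer values (with -1 marking labels missing from the old index) are assumed:

import numpy as np

values = np.array([10., 20., 30.])
indexer = np.array([2, -1, 0])   # -1: label not found in the old index
new = values.take(indexer)       # -1 wraps around to the last element...
np.putmask(new, indexer == -1, np.nan)  # ...so mask those slots with NaN
print(new)  # [30. nan 10.]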
Example #23
    def test_delete(self):
        idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx')

        # preserve freq
        expected_0 = timedelta_range(start='2 Days', periods=4, freq='D',
                                     name='idx')
        expected_4 = timedelta_range(start='1 Days', periods=4, freq='D',
                                     name='idx')

        # reset freq to None
        expected_1 = TimedeltaIndex(
            ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx')

        cases = {0: expected_0,
                 -5: expected_0,
                 -1: expected_4,
                 4: expected_4,
                 1: expected_1}
        for n, expected in compat.iteritems(cases):
            result = idx.delete(n)
            tm.assert_index_equal(result, expected)
            assert result.name == expected.name
            assert result.freq == expected.freq

        with pytest.raises((IndexError, ValueError)):
            # either, depending on the numpy version
            result = idx.delete(5)
Example #24
    def checkMovingOLS(self, x, y, window_type='rolling', **kwds):
        window = 25  # must be larger than rank of x

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            moving = ols(y=y, x=x, window_type=window_type,
                         window=window, **kwds)

        index = moving._index

        for n, i in enumerate(moving._valid_indices):
            if window_type == 'rolling' and i >= window:
                prior_date = index[i - window + 1]
            else:
                prior_date = index[0]

            date = index[i]

            x_iter = {}
            for k, v in compat.iteritems(x):
                x_iter[k] = v.truncate(before=prior_date, after=date)
            y_iter = y.truncate(before=prior_date, after=date)

            with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
                static = ols(y=y_iter, x=x_iter, **kwds)

            self.compare(static, moving, event_index=i,
                         result_index=n)

        _check_non_raw_results(moving)
Example #25
    def _make_table(self, ax, df, title, height=None):
        if df is None:
            ax.set_visible(False)
            return

        import pandas.tools.plotting as plotting

        idx_nlevels = df.index.nlevels
        col_nlevels = df.columns.nlevels
        # must be converted here to get index levels for colorization
        df = self._insert_index(df)
        tb = plotting.table(ax, df, loc=9)
        tb.set_fontsize(self.font_size)

        if height is None:
            height = 1.0 / (len(df) + 1)

        props = tb.properties()
        for (r, c), cell in compat.iteritems(props['celld']):
            if c == -1:
                cell.set_visible(False)
            elif r < col_nlevels and c < idx_nlevels:
                cell.set_visible(False)
            elif r < col_nlevels or c < idx_nlevels:
                cell.set_facecolor('#AAAAAA')
            cell.set_height(height)

        ax.set_title(title, size=self.font_size)
        ax.axis('off')
Example #26
    def _simple_new(cls, start, stop=None, step=None, name=None,
                    dtype=None, **kwargs):
        result = object.__new__(cls)

        # handle passed None, non-integers
        if start is None and stop is None:
            # empty
            start, stop, step = 0, 0, 1

        if start is None or not is_integer(start):
            try:

                return RangeIndex(start, stop, step, name=name, **kwargs)
            except TypeError:
                return Index(start, stop, step, name=name, **kwargs)

        result._start = start
        result._stop = stop or 0
        result._step = step or 1
        result.name = name
        for k, v in compat.iteritems(kwargs):
            setattr(result, k, v)

        result._reset_identity()
        return result
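For context, a short usage sketch: a RangeIndex stores only the start/stop/step that _simple_new sets above and computes its values on demand:

import pandas as pd

idx = pd.RangeIndex(start=0, stop=10, step=2, name='r')
print(list(idx))  # [0, 2, 4, 6, 8] -- materialized lazily, not stored
print(idx.name)   # 'r'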
Example #27
    def test_difference_base(self):
        for name, idx in compat.iteritems(self.indices):
            first = idx[2:]
            second = idx[:4]
            answer = idx[4:]
            result = first.difference(second)

            if isinstance(idx, CategoricalIndex):
                pass
            else:
                self.assertTrue(tm.equalContents(result, answer))

            # GH 10149
            cases = [klass(second.values)
                     for klass in [np.array, Series, list]]
            for case in cases:
                if isinstance(idx, PeriodIndex):
                    msg = "can only call with other PeriodIndex-ed objects"
                    with tm.assertRaisesRegexp(ValueError, msg):
                        result = first.difference(case)
                elif isinstance(idx, CategoricalIndex):
                    pass
                elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)):
                    self.assertEqual(result.__class__, answer.__class__)
                    tm.assert_numpy_array_equal(result.asi8, answer.asi8)
                else:
                    result = first.difference(case)
                    self.assertTrue(tm.equalContents(result, answer))

            if isinstance(idx, MultiIndex):
                msg = "other must be a MultiIndex or a list of tuples"
                with tm.assertRaisesRegexp(TypeError, msg):
                    result = first.difference([1, 2, 3])
Example #28
    def test_symmetric_difference(self):
        for name, idx in compat.iteritems(self.indices):
            first = idx[1:]
            second = idx[:-1]
            if isinstance(idx, CategoricalIndex):
                pass
            else:
                answer = idx[[0, -1]]
                result = first.symmetric_difference(second)
                self.assertTrue(tm.equalContents(result, answer))

            # GH 10149
            cases = [klass(second.values)
                     for klass in [np.array, Series, list]]
            for case in cases:
                if isinstance(idx, PeriodIndex):
                    msg = "can only call with other PeriodIndex-ed objects"
                    with tm.assertRaisesRegexp(ValueError, msg):
                        result = first.symmetric_difference(case)
                elif isinstance(idx, CategoricalIndex):
                    pass
                else:
                    result = first.symmetric_difference(case)
                    self.assertTrue(tm.equalContents(result, answer))

            if isinstance(idx, MultiIndex):
                msg = "other must be a MultiIndex or a list of tuples"
                with tm.assertRaisesRegexp(TypeError, msg):
                    result = first.symmetric_difference([1, 2, 3])

        # 12591 deprecated
        with tm.assert_produces_warning(FutureWarning):
            first.sym_diff(second)
Example #29
    def test_ensure_copied_data(self):
        # Check the "copy" argument of each Index.__new__ is honoured
        # GH12309
        for name, index in compat.iteritems(self.indices):
            init_kwargs = {}
            if isinstance(index, PeriodIndex):
                # Needs "freq" specification:
                init_kwargs['freq'] = index.freq
            elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)):
                # RangeIndex cannot be initialized from data
                # MultiIndex and CategoricalIndex are tested separately
                continue

            index_type = index.__class__
            result = index_type(index.values, copy=True, **init_kwargs)
            tm.assert_index_equal(index, result)
            tm.assert_numpy_array_equal(index.values, result.values,
                                        check_same='copy')

            if not isinstance(index, PeriodIndex):
                result = index_type(index.values, copy=False, **init_kwargs)
                tm.assert_numpy_array_equal(index.values, result.values,
                                            check_same='same')
                tm.assert_numpy_array_equal(index._values, result._values,
                                            check_same='same')
            else:
                # .values an object array of Period, thus copied
                result = index_type(ordinal=index.asi8, copy=False,
                                    **init_kwargs)
                tm.assert_numpy_array_equal(index._values, result._values,
                                            check_same='same')
Example #30
    def test_union_base(self):
        for name, idx in compat.iteritems(self.indices):
            first = idx[3:]
            second = idx[:5]
            everything = idx
            union = first.union(second)
            self.assertTrue(tm.equalContents(union, everything))

            # GH 10149
            cases = [klass(second.values)
                     for klass in [np.array, Series, list]]
            for case in cases:
                if isinstance(idx, PeriodIndex):
                    msg = "can only call with other PeriodIndex-ed objects"
                    with tm.assertRaisesRegexp(ValueError, msg):
                        result = first.union(case)
                elif isinstance(idx, CategoricalIndex):
                    pass
                else:
                    result = first.union(case)
                    self.assertTrue(tm.equalContents(result, everything))

            if isinstance(idx, MultiIndex):
                msg = "other must be a MultiIndex or a list of tuples"
                with tm.assertRaisesRegexp(TypeError, msg):
                    result = first.union([1, 2, 3])
Example #31
    def test_replace_input_formats(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({
            'A': [np.nan, 0, np.inf],
            'B': [0, 2, 5],
            'C': ['', 'asdf', 'fd']
        })
        filled = df.replace(to_rep, values)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(to_rep[k], values[k])
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({
            'A': [np.nan, 5, np.inf],
            'B': [5, 2, 0],
            'C': ['', 'asdf', 'fd']
        })
        assert_frame_equal(result, expected)

        # dict to scalar
        filled = df.replace(to_rep, 0)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(to_rep[k], 0)
        assert_frame_equal(filled, DataFrame(expected))

        self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({
            'A': [np.nan, 0, np.nan],
            'B': [0, 2, 5],
            'C': ['', 'asdf', 'fd']
        })
        filled = df.replace(np.nan, values)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(np.nan, values[k])
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        self.assertRaises(ValueError, df.replace, to_rep, values[1:])

        # list to scalar
        to_rep = [np.nan, 0, '']
        result = df.replace(to_rep, -1)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], -1, inplace=True)
        assert_frame_equal(result, expected)
Example #32
class Resolution(object):

    # defined in period.pyx
    # note that these are different from freq codes
    RESO_US = period.US_RESO
    RESO_MS = period.MS_RESO
    RESO_SEC = period.S_RESO
    RESO_MIN = period.T_RESO
    RESO_HR = period.H_RESO
    RESO_DAY = period.D_RESO

    _reso_str_map = {
        RESO_US: 'microsecond',
        RESO_MS: 'millisecond',
        RESO_SEC: 'second',
        RESO_MIN: 'minute',
        RESO_HR: 'hour',
        RESO_DAY: 'day'
    }

    _str_reso_map = dict([(v, k) for k, v in compat.iteritems(_reso_str_map)])

    _reso_freq_map = {
        'year': 'A',
        'quarter': 'Q',
        'month': 'M',
        'day': 'D',
        'hour': 'H',
        'minute': 'T',
        'second': 'S',
        'millisecond': 'L',
        'microsecond': 'U',
        'nanosecond': 'N'
    }

    _freq_reso_map = dict([(v, k)
                           for k, v in compat.iteritems(_reso_freq_map)])

    @classmethod
    def get_str(cls, reso):
        """
        Return resolution str against resolution code.

        Example
        -------
        >>> Resolution.get_str(Resolution.RESO_SEC)
        'second'
        """
        return cls._reso_str_map.get(reso, 'day')

    @classmethod
    def get_reso(cls, resostr):
        """
        Return resolution code against resolution str.

        Example
        -------
        >>> Resolution.get_reso('second')
        2

        >>> Resolution.get_reso('second') == Resolution.RESO_SEC
        True
        """
        return cls._str_reso_map.get(resostr, cls.RESO_DAY)

    @classmethod
    def get_freq_group(cls, resostr):
        """
        Return frequency group code against resolution str.

        Example
        -------
        >>> Resolution.get_freq_group('day')
        4000
        """
        return get_freq_group(cls.get_freq(resostr))

    @classmethod
    def get_freq(cls, resostr):
        """
        Return frequency str against resolution str.

        Example
        -------
        >>> Resolution.get_freq('day')
        'D'
        """
        return cls._reso_freq_map[resostr]

    @classmethod
    def get_str_from_freq(cls, freq):
        """
        Return resolution str against frequency str.

        Example
        -------
        >>> Resolution.get_str_from_freq('H')
        'hour'
        """
        return cls._freq_reso_map.get(freq, 'day')

    @classmethod
    def get_reso_from_freq(cls, freq):
        """
        Return resolution code against frequency str.

        Example
        -------
        >>> Resolution.get_reso_from_freq('H')
        4

        >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR
        True
        """
        return cls.get_reso(cls.get_str_from_freq(freq))
Example #33
    def test_dict_iterators(self):
        assert next(itervalues({1: 2})) == 2
        assert next(iterkeys({1: 2})) == 1
        assert next(iteritems({1: 2})) == (1, 2)
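A minimal sketch of the Python 2/3 shims these tests exercise; this mirrors what the pandas.compat helpers of that era did (an assumption, not copied from pandas):

import sys

PY3 = sys.version_info[0] >= 3

if PY3:
    def iteritems(obj, **kw):
        # py3: dicts expose items(); wrap in iter() for a lazy iterator
        return iter(obj.items(**kw))
else:
    def iteritems(obj, **kw):
        # py2: dicts already provide a lazy iteritems()
        return obj.iteritems(**kw)

assert next(iteritems({1: 2})) == (1, 2)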
Example #34
def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2008], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2007
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]
        mdata[target] = _concat._concat_compat(to_concat)
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata))

    return DataFrame(mdata, columns=id_cols + pivot_cols)
Example #35
    def intframe(self):
        # force these all to int64 to avoid platform testing issues
        return pd.DataFrame(dict([(c, s) for c, s in
                                  compat.iteritems(_intframe)]),
                            dtype=np.int64)
Example #36
    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function

        Returns
        -------
        tuple of result, how

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        _axis = kwargs.pop('_axis', None)
        if _axis is None:
            _axis = getattr(self, 'axis', 0)
        _level = kwargs.pop('_level', None)

        if isinstance(arg, compat.string_types):
            return self._try_aggregate_string_function(arg, *args,
                                                       **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            def nested_renaming_depr(level=4):
                # deprecation of nested renaming
                # GH 15931
                warnings.warn(("using a dict with renaming "
                               "is deprecated and will be removed in a future "
                               "version"),
                              FutureWarning,
                              stacklevel=level)

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            msg = ('cannot perform renaming for {key} with a '
                                   'nested dictionary').format(key=k)
                            raise SpecificationError(msg)
                        nested_renaming_depr(4 + (_level or 0))

                    elif isinstance(obj, ABCSeries):
                        nested_renaming_depr()

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                keys = list(compat.iterkeys(arg))
                if (isinstance(obj, ABCDataFrame)
                        and len(obj.columns.intersection(keys)) != len(keys)):
                    nested_renaming_depr()

            from pandas.core.reshape.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(
                            self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series():
                # return a boolean if we have *any* nested series
                return any(
                    isinstance(r, ABCSeries)
                    for r in compat.itervalues(result))

            def is_any_frame():
                # return a boolean if we have *any* nested frames
                return any(
                    isinstance(r, ABCDataFrame)
                    for r in compat.itervalues(result))

            if isinstance(result, list):
                return concat(result, keys=keys, axis=1), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame

                return concat([result[k] for k in keys], keys=keys,
                              axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():

                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast

                    raise ValueError("cannot perform both aggregation "
                                     "and transformation operations "
                                     "simultaneously")

                return result, True

            # fall thru
            from pandas import DataFrame, Series
            try:
                result = DataFrame(result)
            except ValueError:

                # we have a dict of scalars
                result = Series(result, name=getattr(self, 'name', None))

            return result, True
        elif is_list_like(arg) and not isinstance(arg, compat.string_types):
            # we require a list, but not an 'str'
            return self._aggregate_multiple_funcs(arg,
                                                  _level=_level,
                                                  _axis=_axis), None
        else:
            result = None

        f = self._is_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True
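A short usage sketch of the dict branch handled above, via the public DataFrame.agg (which routes through _aggregate):

import pandas as pd

df = pd.DataFrame({'A': [1.0, 2.0], 'B': [3, 4]})
# one aggregation per column; scalar specs are normalized to list-likes
print(df.agg({'A': 'mean', 'B': 'sum'}))  # A -> 1.5, B -> 7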
Example #37
import numpy as np

from pandas import compat
from pandas.util._decorators import cache_readonly
import pandas.util.testing as tm
import pandas as pd

_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()

_frame = pd.DataFrame(_seriesd)
_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
_intframe = pd.DataFrame(dict((k, v.astype(int))
                              for k, v in compat.iteritems(_seriesd)))

_tsframe = pd.DataFrame(_tsd)

_mixed_frame = _frame.copy()
_mixed_frame['foo'] = 'bar'


class TestData(object):

    @cache_readonly
    def frame(self):
        return _frame.copy()

    @cache_readonly
    def frame2(self):
        return _frame2.copy()
Example #38
    def test_iteritems_names(self):
        for k, v in compat.iteritems(self.mixed_frame):
            assert v.name == k
Example #39
class Resolution(object):

    RESO_US = RESO_US
    RESO_MS = RESO_MS
    RESO_SEC = RESO_SEC
    RESO_MIN = RESO_MIN
    RESO_HR = RESO_HR
    RESO_DAY = RESO_DAY

    _reso_str_map = {
        RESO_NS: 'nanosecond',
        RESO_US: 'microsecond',
        RESO_MS: 'millisecond',
        RESO_SEC: 'second',
        RESO_MIN: 'minute',
        RESO_HR: 'hour',
        RESO_DAY: 'day'
    }

    # factor to multiply a value by to convert it to the next finer grained
    # resolution
    _reso_mult_map = {
        RESO_NS: None,
        RESO_US: 1000,
        RESO_MS: 1000,
        RESO_SEC: 1000,
        RESO_MIN: 60,
        RESO_HR: 60,
        RESO_DAY: 24
    }

    _reso_str_bump_map = {
        'D': 'H',
        'H': 'T',
        'T': 'S',
        'S': 'L',
        'L': 'U',
        'U': 'N',
        'N': None
    }

    _str_reso_map = dict([(v, k) for k, v in compat.iteritems(_reso_str_map)])

    _reso_freq_map = {
        'year': 'A',
        'quarter': 'Q',
        'month': 'M',
        'day': 'D',
        'hour': 'H',
        'minute': 'T',
        'second': 'S',
        'millisecond': 'L',
        'microsecond': 'U',
        'nanosecond': 'N'
    }

    _freq_reso_map = dict([(v, k)
                           for k, v in compat.iteritems(_reso_freq_map)])

    @classmethod
    def get_str(cls, reso):
        """
        Return resolution str against resolution code.

        Example
        -------
        >>> Resolution.get_str(Resolution.RESO_SEC)
        'second'
        """
        return cls._reso_str_map.get(reso, 'day')

    @classmethod
    def get_reso(cls, resostr):
        """
        Return resolution code against resolution str.

        Example
        -------
        >>> Resolution.get_reso('second')
        2

        >>> Resolution.get_reso('second') == Resolution.RESO_SEC
        True
        """
        return cls._str_reso_map.get(resostr, cls.RESO_DAY)

    @classmethod
    def get_freq_group(cls, resostr):
        """
        Return frequency group code against resolution str.

        Example
        -------
        >>> Resolution.get_freq_group('day')
        4000
        """
        return get_freq_group(cls.get_freq(resostr))

    @classmethod
    def get_freq(cls, resostr):
        """
        Return frequency str against resolution str.

        Example
        -------
        >>> Resolution.get_freq('day')
        'D'
        """
        return cls._reso_freq_map[resostr]

    @classmethod
    def get_str_from_freq(cls, freq):
        """
        Return resolution str against frequency str.

        Example
        -------
        >>> Resolution.get_str_from_freq('H')
        'hour'
        """
        return cls._freq_reso_map.get(freq, 'day')

    @classmethod
    def get_reso_from_freq(cls, freq):
        """
        Return resolution code against frequency str.

        Example
        -------
        >>> Resolution.get_reso_from_freq('H')
        4

        >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR
        True
        """
        return cls.get_reso(cls.get_str_from_freq(freq))

    @classmethod
    def get_stride_from_decimal(cls, value, freq):
        """
        Convert freq with decimal stride into a higher freq with integer stride

        Parameters
        ----------
        value : integer or float
        freq : string
            Frequency string

        Raises
        ------
        ValueError
            If the float cannot be converted to an integer at any resolution.

        Example
        -------
        >>> Resolution.get_stride_from_decimal(1.5, 'T')
        (90, 'S')

        >>> Resolution.get_stride_from_decimal(1.04, 'H')
        (3744, 'S')

        >>> Resolution.get_stride_from_decimal(1, 'D')
        (1, 'D')
        """

        if np.isclose(value % 1, 0):
            return int(value), freq
        else:
            start_reso = cls.get_reso_from_freq(freq)
            if start_reso == 0:
                raise ValueError(
                    "Could not convert to integer offset at any resolution")

            next_value = cls._reso_mult_map[start_reso] * value
            next_name = cls._reso_str_bump_map[freq]
            return cls.get_stride_from_decimal(next_value, next_name)
Example #40
    def test_iteritems(self):
        df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
        for k, v in compat.iteritems(df):
            assert isinstance(v, self.klass._constructor_sliced)
Example #41
def json_normalize(data,
                   record_path=None,
                   meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
                   sep='.'):
    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table
    record_prefix : string, default None
        If True, prefix records with the dotted path, e.g. foo.bar.field if
        path to records is ['foo', 'bar']
    meta_prefix : string, default None
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

        .. versionadded:: 0.20.0


    Returns
    -------
    frame : DataFrame

    Examples
    --------

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> from pandas.io.json import json_normalize
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    """
    def _pull_field(js, spec):
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    if isinstance(data, list) and len(data) == 0:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        if any(isinstance(x, dict) for x in compat.itervalues(data[0])):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    for i, x in enumerate(meta):
        if not isinstance(x, list):
            meta[i] = [x]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    if not isinstance(sep, compat.string_types):
        sep = str(sep)
    meta_keys = [sep.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if len(path) > 1:
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]],
                                   path[1:],
                                   seen_meta,
                                   level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                raise KeyError(
                                    "Try running with errors='ignore' "
                                    "as key {0} is not always "
                                    "present".format(e))
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result.rename(columns=lambda x: record_prefix + x, inplace=True)

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name %s, '
                             'need distinguishing prefix ' % k)

        result[k] = np.array(v).repeat(lengths)

    return result
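A short usage sketch of the prefix and error-handling options (the data and the 'item.' prefix are hypothetical; column order may vary):

from pandas.io.json import json_normalize

data = [{'id': 1,
         'info': {'name': 'a'},
         'items': [{'qty': 2}, {'qty': 3}]},
        {'id': 2,
         'items': [{'qty': 5}]}]  # note: no 'info' key here

# errors='ignore' fills missing meta keys with NaN instead of raising
result = json_normalize(data, 'items',
                        meta=['id', ['info', 'name']],
                        record_prefix='item.',
                        errors='ignore')
print(result.columns.tolist())  # roughly ['item.qty', 'id', 'info.name']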
Ejemplo n.º 42
0
    def test_combineSeries(self):

        # Series
        series = self.frame.xs(self.frame.index[0])

        added = self.frame + series

        for key, s in compat.iteritems(added):
            assert_series_equal(s, self.frame[key] + series[key])

        larger_series = series.to_dict()
        larger_series['E'] = 1
        larger_series = Series(larger_series)
        larger_added = self.frame + larger_series

        for key, s in compat.iteritems(self.frame):
            assert_series_equal(larger_added[key], s + series[key])
        self.assertIn('E', larger_added)
        self.assertTrue(np.isnan(larger_added['E']).all())

        # vs mix (upcast) as needed
        added = self.mixed_float + series
        _check_mixed_float(added, dtype='float64')
        added = self.mixed_float + series.astype('float32')
        _check_mixed_float(added, dtype=dict(C=None))
        added = self.mixed_float + series.astype('float16')
        _check_mixed_float(added, dtype=dict(C=None))

        # these raise with numexpr, as we are adding an int64 to a
        # uint64; weird vs int

        # added = self.mixed_int + (100*series).astype('int64')
        # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
        # 'int64', D = 'int64'))
        # added = self.mixed_int + (100*series).astype('int32')
        # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
        # 'int32', D = 'int64'))

        # TimeSeries
        ts = self.tsframe['A']

        # GH10890
        # we no longer allow automatic timeseries broadcasting
        # and require explicit broadcasting
        added = self.tsframe.add(ts, axis='index')

        for key, col in compat.iteritems(self.tsframe):
            result = col + ts
            assert_series_equal(added[key], result, check_names=False)
            self.assertEqual(added[key].name, key)
            if col.name == ts.name:
                self.assertEqual(result.name, 'A')
            else:
                self.assertTrue(result.name is None)

        smaller_frame = self.tsframe[:-5]
        smaller_added = smaller_frame.add(ts, axis='index')

        self.assertTrue(smaller_added.index.equals(self.tsframe.index))

        smaller_ts = ts[:-5]
        smaller_added2 = self.tsframe.add(smaller_ts, axis='index')
        assert_frame_equal(smaller_added, smaller_added2)

        # length 0, result is all-nan
        result = self.tsframe.add(ts[:0], axis='index')
        expected = DataFrame(np.nan,
                             index=self.tsframe.index,
                             columns=self.tsframe.columns)
        assert_frame_equal(result, expected)

        # Frame is all-nan
        result = self.tsframe[:0].add(ts, axis='index')
        expected = DataFrame(np.nan,
                             index=self.tsframe.index,
                             columns=self.tsframe.columns)
        assert_frame_equal(result, expected)

        # empty but with non-empty index
        frame = self.tsframe[:1].reindex(columns=[])
        result = frame.mul(ts, axis='index')
        self.assertEqual(len(result), len(ts))
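A minimal sketch of the explicit-broadcasting rule exercised above (the frame, index, and column names are hypothetical):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6.0).reshape(3, 2),
                  index=pd.date_range('2000-01-01', periods=3),
                  columns=['A', 'B'])
ts = df['A']

# df + ts would align ts against the columns (row-wise broadcasting);
# to add a column Series down each column, broadcast along the index
added = df.add(ts, axis='index')
print(added['B'].equals(df['B'] + ts))  # True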
Ejemplo n.º 43
0
 def test_constructor_subclass_dict(self):
     data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
     series = self.series_klass(data)
     expected = self.series_klass(dict(compat.iteritems(data)))
     self._assert_series_equal(series, expected)
Ejemplo n.º 44
0
    'Min': 'T',
    'min': 'T',
    'ms': 'L',
    'us': 'U'
}

# TODO: Can this be killed?
for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']):
    for _iweek in range(4):
        _name = 'WOM-%d%s' % (_iweek + 1, _weekday)
        _rule_aliases[_name.replace('-', '@')] = _name

# Note that _rule_aliases is not 1:1 (d[BA]==d[A@DEC]), and so traversal
# order matters when constructing an inverse. we pick one. #2331
_legacy_reverse_map = dict(
    (v, k) for k, v in reversed(sorted(compat.iteritems(_rule_aliases))))


def to_offset(freqstr):
    """
    Return DateOffset object from string representation

    Examples
    --------
    >>> to_offset('5Min')
    Minute(5)
    """
    if freqstr is None:
        return None

    if isinstance(freqstr, DateOffset):
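The snippet above is truncated; a brief usage sketch of the documented behavior (import path as of this pandas era; the printed reprs are indicative):

from pandas.tseries.frequencies import to_offset

print(to_offset('5Min'))      # Minute(5), per the docstring
print(to_offset('WOM-1MON'))  # week-of-month offset built from the aliases above
print(to_offset(None))        # None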
Ejemplo n.º 45
0
    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function

        Returns
        -------
        tuple of result, how

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required
        """

        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        _level = kwargs.pop('_level', None)
        if isinstance(arg, compat.string_types):
            return getattr(self, arg)(*args, **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            raise SpecificationError('cannot perform renaming '
                                                     'for {0} with a nested '
                                                     'dictionary'.format(k))

                arg = new_arg

            from pandas.tools.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(
                            self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(compat.iterkeys(arg))):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results
            if isinstance(result, list):
                result = concat(result, keys=keys, axis=1)
            elif isinstance(list(compat.itervalues(result))[0], ABCDataFrame):
                result = concat([result[k] for k in keys], keys=keys, axis=1)
            else:
                from pandas import DataFrame
                result = DataFrame(result)

            return result, True
        elif hasattr(arg, '__iter__'):
            return self._aggregate_multiple_funcs(arg, _level=_level), None
        else:
            result = None

        cy_func = self._is_cython_func(arg)
        if cy_func and not args and not kwargs:
            return getattr(self, cy_func)(), None

        # caller can react
        return result, True
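Seen through the public agg API, a short sketch of the dict handling this implements (the frame and column names are hypothetical):

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# string -> dispatched to the named method
print(df.agg('mean'))

# a dict of scalars is normalized internally to a dict of list-likes
print(df.agg({'A': 'mean', 'B': ['min', 'max']}))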
Ejemplo n.º 46
0
    def apply(self, func, axis=0, broadcast=None, reduce=None,
              result_type=None):
        """
        Analogous to DataFrame.apply, for SparseDataFrame

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : {0, 1, 'index', 'columns'}
        broadcast : bool, default False
            For aggregation functions, return object of same size with values
            propagated

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='broadcast'.

        reduce : boolean or None, default None
            Try to apply reduction procedures. If the DataFrame is empty,
            apply will use reduce to determine whether the result should be a
            Series or a DataFrame. If reduce is None (the default), apply's
            return value will be guessed by calling func on an empty Series (note:
            while guessing, exceptions raised by func will be ignored). If
            reduce is True a Series will always be returned, and if False a
            DataFrame will always be returned.

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='reduce'.

        result_type : {'expand', 'reduce', 'broadcast', None}
            These only act when axis=1 (columns):

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : return a Series if possible rather than expanding
              list-like results. This is the opposite to 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the frame, the original index & columns will be retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a Series
            of those. However if the apply function returns a Series these
            are expanded to columns.

            .. versionadded:: 0.23.0

        Returns
        -------
        applied : Series or SparseDataFrame
        """
        if not len(self.columns):
            return self
        axis = self._get_axis_number(axis)

        if isinstance(func, np.ufunc):
            new_series = {}
            for k, v in compat.iteritems(self):
                applied = func(v)
                applied.fill_value = func(v.fill_value)
                new_series[k] = applied
            return self._constructor(
                new_series, index=self.index, columns=self.columns,
                default_fill_value=self._default_fill_value,
                default_kind=self._default_kind).__finalize__(self)

        from pandas.core.apply import frame_apply
        op = frame_apply(self,
                         func=func,
                         axis=axis,
                         reduce=reduce,
                         broadcast=broadcast,
                         result_type=result_type)
        return op.get_result()
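A minimal sketch of the ufunc fast path above, assuming a pandas version that still ships SparseDataFrame (it was removed in pandas 1.0); the data is hypothetical:

import numpy as np
import pandas as pd

sdf = pd.SparseDataFrame({'A': [1.0, np.nan, 4.0]})

# np.ufunc takes the per-column branch: the function is applied to the
# sparse values and to the fill value separately
result = sdf.apply(np.sqrt)
print(result['A'].sp_values)  # array([1., 2.])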
Ejemplo n.º 47
0
 def test_constructor_subclass_dict(self):
     data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
     series = Series(data)
     refseries = Series(dict(compat.iteritems(data)))
     assert_series_equal(refseries, series)
Ejemplo n.º 48
0
 def test_iterkv_names(self):
     for k, v in compat.iteritems(self.mixed_frame):
         self.assertEqual(v.name, k)
Ejemplo n.º 49
0
    def data(self, convert_dates=True, convert_categoricals=True, index=None):
        """
        Reads observations from Stata file, converting them into a DataFrame

        Parameters
        ----------
        convert_dates : boolean, defaults to True
            Convert date variables to DataFrame time values
        convert_categoricals : boolean, defaults to True
            Read value labels and convert columns to Categorical/Factor variables
        index : identifier of index column
            identifier of column that should be used as index of the DataFrame

        Returns
        -------
        y : DataFrame instance
        """
        if self._data_read:
            raise Exception("Data has already been read.")
        self._data_read = True

        stata_dta = self._dataset()

        data = []
        for rownum, line in enumerate(stata_dta):
            # doesn't handle missing value objects, just casts;
            # None will only work without a missing value object
            for i, val in enumerate(line):
                # NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = np.nan
            data.append(tuple(line))

        if convert_categoricals:
            self._read_value_labels()

        data = DataFrame(data, columns=self.varlist, index=index)

        cols_ = np.where(self.dtyplist)[0]
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                if data[col].dtype is not np.dtype(object):
                    data[col] = Series(data[col], data[col].index,
                                       self.dtyplist[i])

        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats,
                                 self.fmtlist))[0]
            for i in cols:
                col = data.columns[i]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(self.fmtlist[i], ))

        if convert_categoricals:
            cols = np.where(
                lmap(lambda x: x in compat.iterkeys(self.value_label_dict),
                     self.lbllist))[0]
            for i in cols:
                col = data.columns[i]
                labeled_data = np.copy(data[col])
                labeled_data = labeled_data.astype(object)
                for k, v in compat.iteritems(
                        self.value_label_dict[self.lbllist[i]]):
                    labeled_data[(data[col] == k).values] = v
                data[col] = Categorical.from_array(labeled_data)

        return data
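A usage sketch for this legacy reader interface (the file name is hypothetical and the import path depends on the library version; modern pandas exposes the same conversions through pd.read_stata):

from pandas.io.stata import StataReader  # assumed location of the class above

reader = StataReader('survey.dta')  # hypothetical file
df = reader.data(convert_dates=True, convert_categoricals=True)
print(df.dtypes)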
Ejemplo n.º 50
0
 def _check_cast(df, v):
     self.assertEqual(
         list(set([s.dtype.name for _, s in compat.iteritems(df)]))[0],
         v)
Ejemplo n.º 51
0
 def test_deepcopy(self):
     cp = deepcopy(self.frame)
     series = cp['A']
     series[:] = 10
     for idx, value in compat.iteritems(series):
         assert self.frame['A'][idx] != value
Ejemplo n.º 52
0
 def _check_cast(df, v):
     assert (list(set([s.dtype.name
                       for _, s in compat.iteritems(df)]))[0] == v)
Ejemplo n.º 53
0
 def test_series_put_names(self):
     series = self.mixed_frame._series
     for k, v in compat.iteritems(series):
         assert v.name == k
Ejemplo n.º 54
0
 def test_deepcopy(self):
     cp = deepcopy(self.frame)
     series = cp['A']
     series[:] = 10
     for idx, value in compat.iteritems(series):
         self.assertNotEqual(self.frame['A'][idx], value)
Ejemplo n.º 55
0
 def test_iteritems(self):
     df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
     for k, v in compat.iteritems(df):
         self.assertEqual(type(v), Series)
Ejemplo n.º 56
0
def _zip_axes_from_type(typ, new_axes):
    axes = {}
    for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES):
        axes[ax_name] = new_axes[ax_ind]
    return axes
Ejemplo n.º 57
0
    def test_to_dict(self):
        test_data = {
            'A': {
                '1': 1,
                '2': 2
            },
            'B': {
                '1': '1',
                '2': '2',
                '3': '3'
            },
        }
        recons_data = DataFrame(test_data).to_dict()

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                self.assertEqual(v2, recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("l")

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                self.assertEqual(v2, recons_data[k][int(k2) - 1])

        recons_data = DataFrame(test_data).to_dict("s")

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                self.assertEqual(v2, recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("sp")

        expected_split = {
            'columns': ['A', 'B'],
            'index': ['1', '2', '3'],
            'data': [[1.0, '1'], [2.0, '2'], [nan, '3']]
        }

        tm.assert_almost_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r")

        expected_records = [{
            'A': 1.0,
            'B': '1'
        }, {
            'A': 2.0,
            'B': '2'
        }, {
            'A': nan,
            'B': '3'
        }]

        tm.assert_almost_equal(recons_data, expected_records)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                self.assertEqual(v2, recons_data[k2][k])
Ejemplo n.º 58
0
 def test_series_put_names(self):
     series = self.mixed_frame._series
     for k, v in compat.iteritems(series):
         self.assertEqual(v.name, k)
Ejemplo n.º 59
0
def assert_array_dicts_equal(left, right):
    for k, v in compat.iteritems(left):
        assert np.array_equal(v, right[k])
Ejemplo n.º 60
0
        for path in files:
            if not _should_count_file(path) or path.startswith('test_'):
                continue

            full_path = os.path.join(directory, path)
            counts[full_path] = _get_file_function_lengths(full_path)

    return counts


counts = doit2()

# counts = _get_file_function_lengths('pandas/tests/test_series.py')

all_counts = []
for k, v in compat.iteritems(counts):
    all_counts.extend(v)
all_counts = np.array(all_counts)

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax.hist(all_counts, bins=100)
n = len(all_counts)
nmore = (all_counts > 50).sum()
ax.set_title('%s function lengths, n=%d' % ('pandas', n))
ax.set_ylabel('N functions')
ax.set_xlabel('Function length')
ax.text(100,
        300,
        '%.3f%% with > 50 lines' % (100.0 * nmore / n),
        fontsize=18)