Example #1
    def test_orderedDict_ctor(self):
        # GH3283
        import pandas
        import random
        from collections import OrderedDict

        data = OrderedDict([('col%s' % i, random.random()) for i in range(12)])
        s = pandas.Series(data)
        self.assertTrue(all(s.values == list(data.values())))
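The same guarantee can be checked standalone; a minimal sketch outside any test harness:

    import random
    from collections import OrderedDict

    import pandas as pd

    data = OrderedDict([('col%d' % i, random.random()) for i in range(12)])
    s = pd.Series(data)
    # a Series built from an OrderedDict keeps the insertion order of the keys
    assert list(s.index) == list(data.keys())
    assert all(s.values == list(data.values()))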
Example #2
def create_mgr(descr, item_shape=None):
    """
    Construct BlockManager from string description.

    String description syntax looks similar to np.matrix initializer.  It looks
    like this::

        a,b,c: f8; d,e,f: i8

    Rules are rather simple:

    * see list of supported datatypes in `create_block` method
    * components are semicolon-separated
    * each component is `NAME,NAME,NAME: DTYPE_ID`
    * whitespace around colons & semicolons is removed
    * components with the same DTYPE_ID are combined into a single block
    * to force multiple blocks with the same dtype, use '-SUFFIX'::

        'a:f8-1; b:f8-2; c:f8-foobar'

    """
    if item_shape is None:
        item_shape = (N, )

    offset = 0
    mgr_items = []
    block_placements = OrderedDict()
    for d in descr.split(';'):
        d = d.strip()
        if not len(d):
            continue
        names, blockstr = d.partition(':')[::2]
        blockstr = blockstr.strip()
        names = names.strip().split(',')

        mgr_items.extend(names)
        placement = list(np.arange(len(names)) + offset)
        try:
            block_placements[blockstr].extend(placement)
        except KeyError:
            block_placements[blockstr] = placement
        offset += len(names)

    mgr_items = Index(mgr_items)

    blocks = []
    num_offset = 0
    for blockstr, placement in block_placements.items():
        typestr = blockstr.split('-')[0]
        blocks.append(create_block(typestr,
                                   placement,
                                   item_shape=item_shape,
                                   num_offset=num_offset, ))
        num_offset += len(placement)

    return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]),
                        [mgr_items] + [np.arange(n) for n in item_shape])
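A usage sketch, assuming the surrounding pandas test module supplies `create_block`, `BlockManager`, `Index`, `np`, and the default length `N` (as in pandas' internal test suite):

    # three float64 items in one block, plus one int64 item in its own block
    mgr = create_mgr('a,b,c: f8; d: i8')

    # force two separate float64 blocks via the '-SUFFIX' rule
    mgr2 = create_mgr('a: f8-1; b: f8-2')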
Example #3
    def _init_dict(self, data, axes, dtype=None):
        haxis = axes.pop(self._info_axis_number)

        # prefilter if haxis passed
        if haxis is not None:
            haxis = _ensure_index(haxis)
            data = OrderedDict((k, v) for k, v
                               in compat.iteritems(data) if k in haxis)
        else:
            ks = list(data.keys())
            if not isinstance(data, OrderedDict):
                ks = _try_sort(ks)
            haxis = Index(ks)

        for k, v in compat.iteritems(data):
            if isinstance(v, dict):
                data[k] = self._constructor_sliced(v)

        # extract axis for remaining axes & create the slicemap
        raxes = [self._extract_axis(self, data, axis=i)
                 if a is None else a for i, a in enumerate(axes)]
        raxes_sm = self._extract_axes_for_slice(self, raxes)

        # shallow copy
        arrays = []
        haxis_shape = [len(a) for a in raxes]
        for h in haxis:
            v = values = data.get(h)
            if v is None:
                values = np.empty(haxis_shape, dtype=dtype)
                values.fill(np.nan)
            elif isinstance(v, self._constructor_sliced):
                d = raxes_sm.copy()
                d['copy'] = False
                v = v.reindex(**d)
                if dtype is not None:
                    v = v.astype(dtype)
                values = v.values
            arrays.append(values)

        return self._init_arrays(arrays, haxis, [haxis] + raxes)
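The `_try_sort` branch above is why the dict type matters: plain-dict keys get sorted when building the axis, while OrderedDict keys are taken in insertion order. The rule in isolation (a sketch, not the pandas internals):

    from collections import OrderedDict

    def build_axis_keys(data):
        # mirrors the branch above: sort plain-dict keys, keep OrderedDict order
        ks = list(data.keys())
        if not isinstance(data, OrderedDict):
            ks = sorted(ks)
        return ks

    assert build_axis_keys({'b': 1, 'a': 2}) == ['a', 'b']
    assert build_axis_keys(OrderedDict([('b', 1), ('a', 2)])) == ['b', 'a']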
Example #4
    def test_more_flexible_frame_multi_function(self):

        grouped = self.df.groupby('A')

        exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
        exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))

        expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
        expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

        d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
        result = grouped.aggregate(d)

        assert_frame_equal(result, expected)

        # be careful
        result = grouped.aggregate(
            OrderedDict([['C', np.mean], ['D', [np.mean, np.std]]]))
        expected = grouped.aggregate(
            OrderedDict([['C', np.mean], ['D', [np.mean, np.std]]]))
        assert_frame_equal(result, expected)

        def foo(x):
            return np.mean(x)

        def bar(x):
            return np.std(x, ddof=1)

        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            d = OrderedDict(
                [['C', np.mean],
                 ['D', OrderedDict([['foo', np.mean], ['bar', np.std]])]])
            result = grouped.aggregate(d)

        d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
        expected = grouped.aggregate(d)

        assert_frame_equal(result, expected)
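The non-deprecated core of the pattern above is a dict (here an OrderedDict) mapping columns to lists of functions; a minimal sketch on a throwaway frame:

    import numpy as np
    import pandas as pd
    from collections import OrderedDict

    df = pd.DataFrame({'A': ['x', 'x', 'y'],
                       'C': [1.0, 2.0, 3.0],
                       'D': [4.0, 5.0, 6.0]})
    spec = OrderedDict([('C', [np.mean, np.std]), ('D', [np.mean, np.std])])
    # columns of the result form a (column, funcname) MultiIndex
    result = df.groupby('A').agg(spec)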
Example #5
    def test_blocks_compat_GH9037(self):
        index = pd.date_range('20000101', periods=10, freq='H')
        df_mixed = DataFrame(OrderedDict(
            float_1=[
                -0.92077639, 0.77434435, 1.25234727, 0.61485564, -0.60316077,
                0.24653374, 0.28668979, -2.51969012, 0.95748401, -1.02970536
            ],
            int_1=[
                19680418, 75337055, 99973684, 65103179, 79373900, 40314334,
                21290235, 4991321, 41903419, 16008365
            ],
            str_1=[
                '78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
                'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'
            ],
            float_2=[
                -0.0428278, -1.80872357, 3.36042349, -0.7573685, -0.48217572,
                0.86229683, 1.08935819, 0.93898739, -0.03030452, 1.43366348
            ],
            str_2=[
                '14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
                '08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'
            ],
            int_2=[
                86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
                34193846, 10561746, 24867120, 76131025
            ]),
                             index=index)

        # JSON deserialisation always creates unicode strings
        df_mixed.columns = df_mixed.columns.astype('unicode')

        df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
                                    orient='split')
        assert_frame_equal(df_mixed,
                           df_roundtrip,
                           check_index_type=True,
                           check_column_type=True,
                           check_frame_type=True,
                           by_blocks=True,
                           check_exact=True)
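The round-trip under test is just the public `orient='split'` JSON path; a minimal sketch (wrapping the JSON string in `StringIO`, which newer pandas versions expect):

    from io import StringIO

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
    roundtrip = pd.read_json(StringIO(df.to_json(orient='split')), orient='split')
    pd.testing.assert_frame_equal(df, roundtrip)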
Example #6
    def test_more_flexible_frame_multi_function(self):
        from pandas import concat

        grouped = self.df.groupby('A')

        exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
        exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))

        expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
        expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

        d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
        result = grouped.aggregate(d)

        assert_frame_equal(result, expected)

        # be careful
        result = grouped.aggregate(
            OrderedDict([['C', np.mean], ['D', [np.mean, np.std]]]))
        expected = grouped.aggregate(
            OrderedDict([['C', np.mean], ['D', [np.mean, np.std]]]))
        assert_frame_equal(result, expected)

        def foo(x):
            return np.mean(x)

        def bar(x):
            return np.std(x, ddof=1)

        d = OrderedDict(
            [['C', np.mean],
             ['D', OrderedDict([['foo', np.mean], ['bar', np.std]])]])
        result = grouped.aggregate(d)

        d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
        expected = grouped.aggregate(d)

        assert_frame_equal(result, expected)
Example #7
    def test_multi_function_flexible_mix(self):
        # GH #1268
        grouped = self.df.groupby('A')

        d = OrderedDict([['C',
                          OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                         ['D', 'sum']])
        result = grouped.aggregate(d)
        d2 = OrderedDict(
            [['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
             ['D', ['sum']]])
        result2 = grouped.aggregate(d2)

        d3 = OrderedDict(
            [['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
             ['D', {
                 'sum': 'sum'
             }]])
        expected = grouped.aggregate(d3)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
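The nested per-column renaming dict used here was deprecated in GH 15931; on pandas >= 0.25 the supported spelling is named aggregation. A sketch of the equivalent:

    import pandas as pd

    df = pd.DataFrame({'A': ['x', 'x', 'y'],
                       'C': [1.0, 2.0, 3.0],
                       'D': [4.0, 5.0, 6.0]})
    # named aggregation replaces the deprecated nested-dict renamer
    result = df.groupby('A').agg(foo=('C', 'mean'),
                                 bar=('C', 'std'),
                                 D_sum=('D', 'sum'))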
Example #8
class SelectionMixin(object):
    """
    mixin implementing the selection & aggregation interface on a group-like
    object sub-classes need to define: obj, exclusions
    """
    _selection = None
    _internal_names = ['_cache', '__setstate__']
    _internal_names_set = set(_internal_names)

    _builtin_table = OrderedDict((
        (builtins.sum, np.sum),
        (builtins.max, np.max),
        (builtins.min, np.min),
    ))

    _cython_table = OrderedDict((
        (builtins.sum, 'sum'),
        (builtins.max, 'max'),
        (builtins.min, 'min'),
        (np.all, 'all'),
        (np.any, 'any'),
        (np.sum, 'sum'),
        (np.nansum, 'sum'),
        (np.mean, 'mean'),
        (np.nanmean, 'mean'),
        (np.prod, 'prod'),
        (np.nanprod, 'prod'),
        (np.std, 'std'),
        (np.nanstd, 'std'),
        (np.var, 'var'),
        (np.nanvar, 'var'),
        (np.median, 'median'),
        (np.nanmedian, 'median'),
        (np.max, 'max'),
        (np.nanmax, 'max'),
        (np.min, 'min'),
        (np.nanmin, 'min'),
        (np.cumprod, 'cumprod'),
        (np.nancumprod, 'cumprod'),
        (np.cumsum, 'cumsum'),
        (np.nancumsum, 'cumsum'),
    ))

    @property
    def _selection_name(self):
        """
        return a name for myself; this would ideally be called
        the 'name' property, but we cannot conflict with the
        Series.name property which can be set
        """
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(self._selection,
                          (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):

        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @cache_readonly
    def ndim(self):
        return self._selected_obj.ndim

    @cache_readonly
    def _obj_with_exclusions(self):
        if self._selection is not None and isinstance(self.obj, ABCDataFrame):
            return self.obj.reindex(columns=self._selection_list)

        if len(self.exclusions) > 0:
            return self.obj.drop(self.exclusions, axis=1)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError('Column(s) {selection} already selected'.format(
                selection=self._selection))

        if isinstance(key,
                      (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(key):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError("Columns not found: {missing}".format(
                    missing=str(bad_keys)[1:-1]))
            return self._gotitem(list(key), ndim=2)

        elif not getattr(self, 'as_index', False):
            if key not in self.obj.columns:
                raise KeyError("Column not found: {key}".format(key=key))
            return self._gotitem(key, ndim=2)

        else:
            if key not in self.obj:
                raise KeyError("Column not found: {key}".format(key=key))
            return self._gotitem(key, ndim=1)

    def _gotitem(self, key, ndim, subset=None):
        """
        sub-classes to define; return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : 1,2
            requested ndim of result
        subset : object, default None
            subset to act on

        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate

    def _try_aggregate_string_function(self, arg, *args, **kwargs):
        """
        if arg is a string, then try to operate on it:
        - try to find a function (or attribute) on ourselves
        - try to find a numpy function
        - raise

        """
        assert isinstance(arg, compat.string_types)

        f = getattr(self, arg, None)
        if f is not None:
            if callable(f):
                return f(*args, **kwargs)

            # people may try to aggregate on a non-callable attribute
            # but don't let them think they can pass args to it
            assert len(args) == 0
            assert len([
                kwarg for kwarg in kwargs if kwarg not in ['axis', '_level']
            ]) == 0
            return f

        f = getattr(np, arg, None)
        if f is not None:
            return f(self, *args, **kwargs)

        raise ValueError("{arg} is an unknown string function".format(arg=arg))

    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function

        Returns
        -------
        tuple of result, how

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        _axis = kwargs.pop('_axis', None)
        if _axis is None:
            _axis = getattr(self, 'axis', 0)
        _level = kwargs.pop('_level', None)

        if isinstance(arg, compat.string_types):
            return self._try_aggregate_string_function(arg, *args,
                                                       **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            def nested_renaming_depr(level=4):
                # deprecation of nested renaming
                # GH 15931
                warnings.warn(("using a dict with renaming "
                               "is deprecated and will be removed in a future "
                               "version"),
                              FutureWarning,
                              stacklevel=level)

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            msg = ('cannot perform renaming for {key} with a '
                                   'nested dictionary').format(key=k)
                            raise SpecificationError(msg)
                        nested_renaming_depr(4 + (_level or 0))

                    elif isinstance(obj, ABCSeries):
                        nested_renaming_depr()
                    elif (isinstance(obj, ABCDataFrame)
                          and k not in obj.columns):
                        raise KeyError(
                            "Column '{col}' does not exist!".format(col=k))

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                keys = list(compat.iterkeys(arg))
                if (isinstance(obj, ABCDataFrame)
                        and len(obj.columns.intersection(keys)) != len(keys)):
                    nested_renaming_depr()

            from pandas.core.reshape.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(
                            self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series():
                # return a boolean if we have *any* nested series
                return any(
                    isinstance(r, ABCSeries)
                    for r in compat.itervalues(result))

            def is_any_frame():
                # return a boolean if we have *any* nested frames
                return any(
                    isinstance(r, ABCDataFrame)
                    for r in compat.itervalues(result))

            if isinstance(result, list):
                return concat(result, keys=keys, axis=1, sort=True), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame

                return concat([result[k] for k in keys], keys=keys,
                              axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():

                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast

                    raise ValueError("cannot perform both aggregation "
                                     "and transformation operations "
                                     "simultaneously")

                return result, True

            # fall thru
            from pandas import DataFrame, Series
            try:
                result = DataFrame(result)
            except ValueError:

                # we have a dict of scalars
                result = Series(result, name=getattr(self, 'name', None))

            return result, True
        elif is_list_like(arg) and arg not in compat.string_types:
            # we require a list, but not an 'str'
            return self._aggregate_multiple_funcs(arg,
                                                  _level=_level,
                                                  _axis=_axis), None
        else:
            result = None

        f = self._is_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True

    def _aggregate_multiple_funcs(self, arg, _level, _axis):
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                try:
                    colg = self._gotitem(obj.name, ndim=1, subset=obj)
                    results.append(colg.aggregate(a))

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)
                except (TypeError, DataError):
                    pass
                except SpecificationError:
                    raise

        # multiples
        else:
            for index, col in enumerate(obj):
                try:
                    colg = self._gotitem(col,
                                         ndim=1,
                                         subset=obj.iloc[:, index])
                    results.append(colg.aggregate(arg))
                    keys.append(col)
                except (TypeError, DataError):
                    pass
                except ValueError:
                    # cannot aggregate
                    continue
                except SpecificationError:
                    raise

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError:

            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars

            from pandas.core.dtypes.cast import is_nested_object
            from pandas import Series
            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError("cannot combine transform and "
                                 "aggregation operations")
            return result

    def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
        """
        return a new object with the replacement attributes
        """
        if obj is None:
            obj = self._selected_obj.copy()
        if obj_type is None:
            obj_type = self._constructor
        if isinstance(obj, obj_type):
            obj = obj.obj
        for attr in self._attributes:
            if attr not in kwargs:
                kwargs[attr] = getattr(self, attr)
        return obj_type(obj, **kwargs)

    def _is_cython_func(self, arg):
        """
        if we define an internal function for this argument, return it
        """
        return self._cython_table.get(arg)

    def _is_builtin_func(self, arg):
        """
        if we define a builtin function for this argument, return it,
        otherwise return the arg
        """
        return self._builtin_table.get(arg, arg)
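The two lookup tables at the top of the class drive this dispatch: a function object passed by the user is either swapped for a numpy equivalent (`_builtin_table`) or mapped to the name of an internal cython implementation (`_cython_table`). The same mechanism in isolation:

    import builtins
    from collections import OrderedDict

    import numpy as np

    _builtin_table = OrderedDict([(builtins.sum, np.sum), (builtins.max, np.max)])
    _cython_table = OrderedDict([(np.sum, 'sum'), (np.nansum, 'sum'), (np.mean, 'mean')])

    assert _builtin_table.get(sum, sum) is np.sum   # builtin swapped for numpy func
    assert _cython_table.get(np.nansum) == 'sum'    # numpy func mapped to cython name
    assert _cython_table.get(len) is None           # unknown callables fall through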
Example #9
def validate_argmax_with_skipna(skipna, args, kwargs):
    """
    If 'Series.argmax' is called via the 'numpy' library,
    the third parameter in its signature is 'out', which
    takes either an ndarray or 'None', so check if the
    'skipna' parameter is either an instance of ndarray or
    is None, since 'skipna' itself should be a boolean
    """

    skipna, args = process_skipna(skipna, args)
    validate_argmax(args, kwargs)
    return skipna


ARGSORT_DEFAULTS = OrderedDict()
ARGSORT_DEFAULTS['axis'] = -1
ARGSORT_DEFAULTS['kind'] = 'quicksort'
ARGSORT_DEFAULTS['order'] = None
validate_argsort = CompatValidator(ARGSORT_DEFAULTS,
                                   fname='argsort',
                                   max_fname_arg_count=0,
                                   method='both')

# two different signatures of argsort, this second validation
# for when the `kind` param is supported
ARGSORT_DEFAULTS_KIND = OrderedDict()
ARGSORT_DEFAULTS_KIND['axis'] = -1
ARGSORT_DEFAULTS_KIND['order'] = None
validate_argsort_kind = CompatValidator(ARGSORT_DEFAULTS_KIND,
                                        fname='argsort',
                                        max_fname_arg_count=0,
                                        method='both')
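`CompatValidator` checks that any numpy-compatibility arguments a caller passes still hold their numpy default values, so pandas can safely ignore them. A self-contained sketch of that idea (not the pandas class itself):

    from collections import OrderedDict

    ARGSORT_DEFAULTS = OrderedDict([('axis', -1), ('kind', 'quicksort'), ('order', None)])

    def validate_numpy_kwargs(fname, kwargs, defaults):
        # accept numpy-compat kwargs only when they equal the numpy defaults
        for key, value in kwargs.items():
            if key not in defaults:
                raise TypeError("%s() got an unexpected keyword argument '%s'"
                                % (fname, key))
            if value != defaults[key]:
                raise ValueError("the '%s' parameter is not supported in the "
                                 "pandas implementation of %s()" % (key, fname))

    validate_numpy_kwargs('argsort', {'axis': -1}, ARGSORT_DEFAULTS)  # passes
    # validate_numpy_kwargs('argsort', {'kind': 'mergesort'}, ARGSORT_DEFAULTS)  # raises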
Example #10
def test_agg_misc():
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq='D')
    index.name = 'date'
    df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=['index', 'date'])

    r = df.resample('2D')
    cases = [
        r,
        df_col.resample('2D', on='date'),
        df_mult.resample('2D', level='date'),
        df.groupby(pd.Grouper(freq='2D'))
    ]

    # passed lambda
    for t in cases:
        result = t.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
        rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
        expected = pd.concat([r['A'].sum(), rcustom], axis=1)
        assert_frame_equal(result, expected, check_like=True)

    # agg with renamers
    expected = pd.concat(
        [t['A'].sum(), t['B'].sum(), t['A'].mean(), t['B'].mean()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
                                                  ('result1', 'B'),
                                                  ('result2', 'A'),
                                                  ('result2', 'B')])

    for t in cases:
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = t[['A', 'B']].agg(
                OrderedDict([('result1', np.sum), ('result2', np.mean)]))
        assert_frame_equal(result, expected, check_like=True)

    # agg with different hows
    expected = pd.concat(
        [t['A'].sum(), t['A'].std(), t['B'].mean(), t['B'].std()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), ('A', 'std'),
                                                  ('B', 'mean'), ('B', 'std')])
    for t in cases:
        result = t.agg(
            OrderedDict([('A', ['sum', 'std']), ('B', ['mean', 'std'])]))
        assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for t in cases:
        result = t[['A', 'B']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # series like aggs
    for t in cases:
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = t['A'].agg({'A': ['sum', 'std']})
        expected = pd.concat([t['A'].sum(), t['A'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                      ('A', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat(
            [t['A'].agg(['sum', 'std']), t['A'].agg(['mean', 'std'])], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                      ('A', 'std'),
                                                      ('B', 'mean'),
                                                      ('B', 'std')])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = t['A'].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # errors
    # invalid names in the agg specification
    msg = "\"Column 'B' does not exist!\""
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']})
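Outside the test harness, the per-column list spec works the same on a plain Resampler; a minimal sketch:

    import numpy as np
    import pandas as pd
    from collections import OrderedDict

    idx = pd.date_range('2005-01-01', periods=10, freq='D')
    df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), index=idx)
    # per-column lists of hows -> (column, how) MultiIndex columns
    result = df.resample('2D').agg(OrderedDict([('A', ['sum', 'std']),
                                                ('B', ['mean', 'std'])]))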
Example #11
REPEAT_DEFAULTS = dict(axis=None)
validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat',
                                  method='both', max_fname_arg_count=1)

ROUND_DEFAULTS = dict(out=None)
validate_round = CompatValidator(ROUND_DEFAULTS, fname='round',
                                 method='both', max_fname_arg_count=1)

SORT_DEFAULTS = OrderedDict()
SORT_DEFAULTS['axis'] = -1
SORT_DEFAULTS['kind'] = 'quicksort'
SORT_DEFAULTS['order'] = None
validate_sort = CompatValidator(SORT_DEFAULTS, fname='sort',
                                method='kwargs')

STAT_FUNC_DEFAULTS = OrderedDict()
STAT_FUNC_DEFAULTS['dtype'] = None
STAT_FUNC_DEFAULTS['out'] = None

PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
SUM_DEFAULTS['keepdims'] = False
SUM_DEFAULTS['initial'] = None

MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
MEDIAN_DEFAULTS['overwrite_input'] = False
MEDIAN_DEFAULTS['keepdims'] = False

STAT_FUNC_DEFAULTS['keepdims'] = False

validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS,
                                     method='kwargs')
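Note the chained assignment above: `PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()` makes a single copy and binds both names to it, so the later `SUM_DEFAULTS` mutations are visible through `PROD_DEFAULTS` as well:

    STAT_FUNC_DEFAULTS = {'dtype': None, 'out': None}

    PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()  # one dict, two names
    SUM_DEFAULTS['keepdims'] = False

    assert PROD_DEFAULTS is SUM_DEFAULTS
    assert PROD_DEFAULTS['keepdims'] is False     # mutation visible via both names
    assert 'keepdims' not in STAT_FUNC_DEFAULTS   # the copy source is untouched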
Example #12
# Join the data with the stops
stops_joined = stops_small.set_index('DIVA_NR').join(
    stops.set_index('Haltestellennummer'))
stopIDs = list(stops_joined.Haltestellen_Id.dropna().astype(int))
filtered_data = data[data.Haltestellen_Id.isin(stopIDs)]
print(len(filtered_data))
df = filtered_data[filtered_data.Nach_Hst_Id.isin(stopIDs)]
print(len(df))

# Group by the hours, so we have one average value for each hour of the day
df['hour'] = df['FZ_AB'].str[0:2].astype(int)
df_grouped = df.groupby(['ID_Abschnitt', 'hour']).mean()

# Loop over the data and create an entry with the necessary info for each section
data_json = OrderedDict()
for row in df_grouped.itertuples():

    From = stops_joined.loc[stops_joined.Haltestellen_Id ==
                            row.Haltestellen_Id].index[0]
    To = stops_joined.loc[stops_joined.Haltestellen_Id ==
                          row.Nach_Hst_Id].index[0]
    id_fromTo = str(min(From, To)) + '_' + str(max(From, To))

    # If it does not exist yet (first time this section comes up), create a new entry
    if (id_fromTo not in data_json.keys()):
        data_json[id_fromTo] = OrderedDict()
        data_json[id_fromTo]['Fr'] = From
        data_json[id_fromTo]['To'] = To
        data_json[id_fromTo]['x_from'] = stops_joined.loc[
            stops_joined.Haltestellen_Id == row.Haltestellen_Id].x.iloc[0]
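The listing is cut off here, but the structure it builds is an OrderedDict of per-section OrderedDicts, which converts cleanly to a frame (as Example #13 below does with `pd.DataFrame.from_dict`). A sketch with hypothetical section data:

    import pandas as pd
    from collections import OrderedDict

    data_json = OrderedDict()
    data_json['12_34'] = OrderedDict([('Fr', 12), ('To', 34), ('h_0', 5.0)])
    data_json['12_56'] = OrderedDict([('Fr', 12), ('To', 56), ('h_0', 7.5)])

    # rows and columns come out in insertion order
    final = pd.DataFrame.from_dict(data_json, orient='index')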
Example #13
data_filtered = data[data.ZSID.isin(ids)]
data_filtered.to_csv("../../data/verkehr_filtered_2.csv")

'''
data = pd.read_csv("../../data/traffic/verkehr_filtered.csv", sep=",",  parse_dates=['MessungDatZeit'], date_parser=dateparse)

data['hour'] = data['MessungDatZeit'].dt.hour
#data['day'] = data['MessungDatZeit'].day

df_grouped = data.groupby(['ZSID','hour', 'Richtung']).mean()
print df_grouped.head(30)

df_grouped_all = df_grouped.groupby(['ZSID', 'Richtung']).sum()
print df_grouped_all.head(30)

data_json = OrderedDict()
for row in df_grouped.itertuples():

    key = row.Index[0] + '_' + row.Index[2]
    if (key not in data_json.keys()):
        data_json[key] = OrderedDict()
        #data_json[row.Index[0]]['ZSName'] =  row.ZSName
        data_json[key]['Direction'] =  row.Index[2]
        data_json[key]['EKoord'] =  row.EKoord
        data_json[key]['NKoord'] = row.NKoord
        data_json[key]['all'] = df_grouped_all.loc[row.Index[0], row.Index[2]].AnzFahrzeuge


    data_json[key]['h_' + str(row.Index[1])] =  row.AnzFahrzeuge
    
final = pd.DataFrame.from_dict(data_json, orient='index')
Example #14
    def _parse_excel(self,
                     sheetname=0,
                     header=0,
                     skiprows=None,
                     names=None,
                     skip_footer=0,
                     index_col=None,
                     has_index_names=None,
                     parse_cols=None,
                     parse_dates=False,
                     date_parser=None,
                     na_values=None,
                     thousands=None,
                     convert_float=True,
                     true_values=None,
                     false_values=None,
                     verbose=False,
                     dtype=None,
                     squeeze=False,
                     **kwds):

        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        _validate_header_arg(header)
        if has_index_names is not None:
            warn(
                "\nThe has_index_names argument is deprecated; index names "
                "will be automatically inferred based on index_col.\n"
                "This argmument is still necessary if reading Excel output "
                "from 0.16.2 or prior with index names.",
                FutureWarning,
                stacklevel=3)

        if 'chunksize' in kwds:
            raise NotImplementedError("chunksize keyword of read_excel "
                                      "is not implemented")

        if parse_dates is True and index_col is None:
            warn("The 'parse_dates=True' keyword of read_excel was provided"
                 " without an 'index_col' keyword value.")

        def _parse_cell(cell_contents, cell_typ):
            """converts the contents of the cell into a pandas
               appropriate object"""

            if cell_typ == XL_CELL_DATE:

                if xlrd_0_9_3:
                    # Use the newer xlrd datetime handling.
                    try:
                        cell_contents = \
                            xldate.xldate_as_datetime(cell_contents,
                                                      epoch1904)
                    except OverflowError:
                        return cell_contents
                    # Excel doesn't distinguish between dates and time,
                    # so we treat dates on the epoch as times only.
                    # Also, Excel supports 1900 and 1904 epochs.
                    year = (cell_contents.timetuple())[0:3]
                    if ((not epoch1904 and year == (1899, 12, 31))
                            or (epoch1904 and year == (1904, 1, 1))):
                        cell_contents = time(cell_contents.hour,
                                             cell_contents.minute,
                                             cell_contents.second,
                                             cell_contents.microsecond)
                else:
                    # Use the xlrd <= 0.9.2 date handling.
                    try:
                        dt = xldate.xldate_as_tuple(cell_contents, epoch1904)

                    except xldate.XLDateTooLarge:
                        return cell_contents

                    if dt[0] < MINYEAR:
                        cell_contents = time(*dt[3:])
                    else:
                        cell_contents = datetime(*dt)

            elif cell_typ == XL_CELL_ERROR:
                cell_contents = np.nan
            elif cell_typ == XL_CELL_BOOLEAN:
                cell_contents = bool(cell_contents)
            elif convert_float and cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                val = int(cell_contents)
                if val == cell_contents:
                    cell_contents = val
            return cell_contents

        ret_dict = False
        if isinstance(sheetname, list):
            sheets = sheetname
            ret_dict = True
        elif sheetname is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheetname]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())
        output = OrderedDict()

        import xlrd
        from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False

        # Keep sheetname to maintain backwards compatibility.
        for asheetname in sheets:
            if verbose:
                print("Reading sheet %s" % asheetname)
            if isinstance(asheetname, compat.string_types):
                sheet = self.book.sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.book.sheet_by_index(asheetname)

            data = []
            should_parse = {}

            if sheet.nrows > 5000:
                raise Exception(
                    "The raw file contains more than 5000 rows. Please check if it is correct or split the files (max: 5000 rows) for upload"
                )
            elif kwds.get('MaxTest'):
                continue

            for i in range(sheet.nrows):

                row = []
                for j, (value, typ) in enumerate(
                        zip(sheet.row_values(i), sheet.row_types(i))):
                    if parse_cols is not None and j not in should_parse:
                        should_parse[j] = self._should_parse(j, parse_cols)

                    if parse_cols is None or should_parse[j]:
                        row.append(_parse_cell(value, typ))
                data.append(row)
#            output[asheetname] = data
            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None:
                if is_list_like(header):
                    header_names = []
                    control_row = [True for x in data[0]]
                    for row in header:
                        if is_integer(skiprows):
                            row += skiprows

                        data[row], control_row = _fill_mi_header(
                            data[row], control_row)
                        header_name, data[row] = _pop_header_name(
                            data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # forward fill values for MultiIndex index
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

            if is_list_like(header) and len(header) > 1:
                has_index_names = True

            if kwds.get('parsed'):
                try:
                    parser = TextParser(data,
                                        header=header,
                                        index_col=index_col,
                                        has_index_names=has_index_names,
                                        na_values=na_values,
                                        thousands=thousands,
                                        parse_dates=parse_dates,
                                        date_parser=date_parser,
                                        true_values=true_values,
                                        false_values=false_values,
                                        skiprows=skiprows,
                                        skipfooter=skip_footer,
                                        squeeze=squeeze,
                                        dtype=dtype,
                                        **kwds)
                    output[asheetname] = parser.read()
                    if names is not None:
                        output[asheetname].columns = names
                    if not squeeze or isinstance(output[asheetname],
                                                 DataFrame):
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                except EmptyDataError:
                    # No Data, return an empty DataFrame
                    output[asheetname] = DataFrame()
            else:
                output[asheetname] = data

        if ret_dict or kwds.get('MaxTest'):
            return output
        else:
            return output[asheetname]
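From the caller's side, the OrderedDict return path above is what multi-sheet reads produce; a sketch against the public API (assuming a local file 'workbook.xlsx'; the keyword is `sheetname` in the pandas version shown here, `sheet_name` in later releases):

    import pandas as pd

    # None selects all sheets: one DataFrame per sheet, keyed by sheet name
    sheets = pd.read_excel('workbook.xlsx', sheet_name=None)
    for name, frame in sheets.items():
        print(name, frame.shape)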