Code Example #1
File: merge.py Project: 17705724576-M13Kd/pandas
def _factorize_keys(lk, rk, sort=True):
    if com._is_int_or_datetime_dtype(lk) and com._is_int_or_datetime_dtype(rk):
        klass = lib.Int64Factorizer
        lk = com._ensure_int64(lk)
        rk = com._ensure_int64(rk)
    else:
        klass = lib.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count
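
Every call site collected on this page goes through the pandas-internal helper com._ensure_int64 (later pandas drops the underscore and exposes it as ensure_int64). A rough NumPy-only sketch of the behaviour these snippets assume; the real helper is generated Cython code, so treat this as an approximation:

import numpy as np

def ensure_int64_sketch(values):
    # Approximation of com._ensure_int64: hand back an int64 ndarray,
    # copying only when the input is not already int64.
    arr = np.asarray(values)
    if arr.dtype != np.int64:
        arr = arr.astype(np.int64)
    return arr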
Code Example #2
File: merge.py Project: nitfer/pandas
def _get_single_indexer(join_key, index, sort=False):
    left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

    left_indexer, right_indexer = algos.left_outer_join(
        com._ensure_int64(left_key), com._ensure_int64(right_key), count, sort=sort
    )

    return left_indexer, right_indexer
Code Example #3
File: period.py Project: Arthurkorn/pandas
    def _from_arraylike(cls, data, freq, tz):

        if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)):
            if np.isscalar(data) or isinstance(data, Period):
                raise ValueError('PeriodIndex() must be called with a '
                                 'collection of some kind, %s was passed'
                                 % repr(data))

            # other iterable of some kind
            if not isinstance(data, (list, tuple)):
                data = list(data)

            try:
                data = com._ensure_int64(data)
                if freq is None:
                    raise ValueError('freq not specified')
                data = np.array([Period(x, freq=freq).ordinal for x in data],
                                dtype=np.int64)
            except (TypeError, ValueError):
                data = com._ensure_object(data)

                if freq is None and len(data) > 0:
                    freq = getattr(data[0], 'freq', None)

                if freq is None:
                    raise ValueError('freq not specified and cannot be '
                                     'inferred from first element')

                data = _get_ordinals(data, freq)
        else:
            if isinstance(data, PeriodIndex):
                if freq is None or freq == data.freq:
                    freq = data.freq
                    data = data.values
                else:
                    base1, _ = _gfc(data.freq)
                    base2, _ = _gfc(freq)
                    data = period.period_asfreq_arr(data.values, base1,
                                                   base2, 1)
            else:
                if freq is None and len(data) > 0:
                    freq = getattr(data[0], 'freq', None)

                if freq is None:
                    raise ValueError('freq not specified and cannot be '
                                     'inferred from first element')

                if data.dtype != np.int64:
                    if np.issubdtype(data.dtype, np.datetime64):
                        data = dt64arr_to_periodarr(data, freq, tz)
                    else:
                        try:
                            data = com._ensure_int64(data)
                        except (TypeError, ValueError):
                            data = com._ensure_object(data)
                            data = _get_ordinals(data, freq)

        return data, freq
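
A hypothetical illustration of two of the branches handled by _from_arraylike above: an iterable of period strings with an explicit freq, and a list of Period objects whose freq is inferred from the first element (the values are made up for this sketch):

import pandas as pd

# Explicit freq: every element is converted to an ordinal for that frequency.
print(pd.PeriodIndex(['2000-01', '2000-02', '2000-03'], freq='M'))

# No freq passed: it is inferred from the first Period in the data.
print(pd.PeriodIndex([pd.Period('2000Q1'), pd.Period('2000Q2')]))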
Code Example #4
File: algorithms.py Project: trailsquirrel/pandas
def mode(values):
    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        res = htable.mode_object(values, mask)
        try:
            res = sorted(res)
        except TypeError as e:
            warn("Unable to sort modes: %s" % e)
        result = constructor(res, dtype=dtype)

    return result
Code Example #5
def _get_data_algo(values, func_map):
    mask = None
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)

    elif com.needs_i8_conversion(values):

        # if we have NaT, punt to object dtype
        mask = com.isnull(values)
        if mask.ravel().any():
            f = func_map['generic']
            values = com._ensure_object(values)
            values[mask] = np.nan
        else:
            f = func_map['int64']
            values = values.view('i8')

    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
Code Example #6
File: reshape.py Project: sthorneycroft/pandas
    def get_result(self):
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # we might have a missing index
        if len(index) != values.shape[0]:
            mask = isnull(index)
            if mask.any():
                l = np.arange(len(index))
                values, orig_values = (np.empty((len(index), values.shape[1])),
                                       values)
                values.fill(np.nan)
                values_indexer = com._ensure_int64(l[~mask])
                for i, j in enumerate(values_indexer):
                    values[j] = orig_values[i]
            else:
                index = index.take(self.unique_groups)

        return DataFrame(values, index=index, columns=columns)
Code Example #7
File: test_tseries.py Project: luwenlong123/pandas
    def _check(dtype):
        obj = np.array(np.random.randn(20), dtype=dtype)

        bins = np.array([6, 12, 20])
        out = np.zeros((3, 4), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        labels = com._ensure_int64(
            np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

        func = getattr(algos, 'group_ohlc_%s' % dtype)
        func(out, counts, obj[:, None], labels)

        def _ohlc(group):
            if isnull(group).all():
                return np.repeat(nan, 4)
            return [group[0], group.max(), group.min(), group[-1]]

        expected = np.array(
            [_ohlc(obj[:6]), _ohlc(obj[6:12]),
             _ohlc(obj[12:])])

        assert_almost_equal(out, expected)
        tm.assert_numpy_array_equal(counts, np.array([6, 6, 8],
                                                     dtype=np.int64))

        obj[:6] = nan
        func(out, counts, obj[:, None], labels)
        expected[0] = nan
        assert_almost_equal(out, expected)
Code Example #8
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Returns
    -------
    value_counts : Series
    """
    from collections import defaultdict
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Code Example #9
File: tdi.py Project: AbnerZheng/pandas
    def delete(self, loc):
        """
        Make a new DatetimeIndex with passed location(s) deleted.

        Parameters
        ----------
        loc: int, slice or array of ints
            Indicate which sub-arrays to remove.

        Returns
        -------
        new_index : TimedeltaIndex
        """
        new_tds = np.delete(self.asi8, loc)

        freq = 'infer'
        if is_integer(loc):
            if loc in (0, -len(self), -1, len(self) - 1):
                freq = self.freq
        else:
            if com.is_list_like(loc):
                loc = lib.maybe_indices_to_slice(
                    com._ensure_int64(np.array(loc)), len(self))
            if isinstance(loc, slice) and loc.step in (1, None):
                if (loc.start in (0, None) or loc.stop in (len(self), None)):
                    freq = self.freq

        return TimedeltaIndex(new_tds, name=self.name, freq=freq)
Code Example #10
File: algorithms.py Project: bburan-galenea/pandas
def mode(values):
    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        res = htable.mode_object(values, mask)
        try:
            res = sorted(res)
        except TypeError as e:
            warn("Unable to sort modes: %s" % e)
        result = constructor(res, dtype=dtype)

    return result
Code Example #11
File: algorithms.py Project: takluyver/pandas
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Returns
    -------
    value_counts : Series
    """
    from collections import defaultdict
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Code Example #12
    def delete(self, loc):
        """
        Make a new DatetimeIndex with passed location(s) deleted.

        Parameters
        ----------
        loc: int, slice or array of ints
            Indicate which sub-arrays to remove.

        Returns
        -------
        new_index : TimedeltaIndex
        """
        new_tds = np.delete(self.asi8, loc)

        freq = 'infer'
        if lib.is_integer(loc):
            if loc in (0, -len(self), -1, len(self) - 1):
                freq = self.freq
        else:
            if com.is_list_like(loc):
                loc = lib.maybe_indices_to_slice(
                    com._ensure_int64(np.array(loc)))
            if isinstance(loc, slice) and loc.step in (1, None):
                if (loc.start in (0, None) or loc.stop in (len(self), None)):
                    freq = self.freq

        return TimedeltaIndex(new_tds, name=self.name, freq=freq)
Code Example #13
File: test_tseries.py Project: Zando2011/pandas
    def _check(dtype):
        obj = np.array(np.random.randn(20), dtype=dtype)

        bins = np.array([6, 12, 20])
        out = np.zeros((3, 4), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        labels = com._ensure_int64(np.repeat(
            np.arange(3), np.diff(np.r_[0, bins])))

        func = getattr(algos, 'group_ohlc_%s' % dtype)
        func(out, counts, obj[:, None], labels)

        def _ohlc(group):
            if isnull(group).all():
                return np.repeat(nan, 4)
            return [group[0], group.max(), group.min(), group[-1]]

        expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
                             _ohlc(obj[12:])])

        assert_almost_equal(out, expected)
        assert_almost_equal(counts, [6, 6, 8])

        obj[:6] = nan
        func(out, counts, obj[:, None], labels)
        expected[0] = nan
        assert_almost_equal(out, expected)
Code Example #14
File: reshape.py Project: AjayRamanathan/pandas
    def get_result(self):
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # we might have a missing index
        if len(index) != values.shape[0]:
            mask = isnull(index)
            if mask.any():
                l = np.arange(len(index))
                values, orig_values = (np.empty((len(index), values.shape[1])),
                                       values)
                values.fill(np.nan)
                values_indexer = com._ensure_int64(l[~mask])
                for i, j in enumerate(values_indexer):
                    values[j] = orig_values[i]
            else:
                index = index.take(self.unique_groups)
        
        return DataFrame(values, index=index, columns=columns)
Code Example #15
File: algorithms.py Project: Garrett-R/pandas
def _get_data_algo(values, func_map):
    mask = None
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)

    elif com.needs_i8_conversion(values):

        # if we have NaT, punt to object dtype
        mask = com.isnull(values)
        if mask.ravel().any():
            f = func_map['generic']
            values = com._ensure_object(values)
            values[mask] = np.nan
        else:
            f = func_map['int64']
            values = values.view('i8')

    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
Code Example #16
File: algorithms.py Project: paddymul/pandas
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Code Example #17
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     return super(DatetimeIndexOpsMixin, self).take(indices, axis)
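
This take (and the near-identical variants further down) first coerces the positions to int64 and then checks whether they can be expressed as a slice, which lets contiguous requests reuse a cheap view of self. A hypothetical usage sketch against a DatetimeIndex:

import numpy as np
import pandas as pd

idx = pd.date_range('2000-01-01', periods=5, freq='D')
# Contiguous, step-1 positions can be served through the slice fast path ...
print(idx.take(np.array([1, 2, 3])))
# ... while arbitrary positions fall back to a plain ndarray take.
print(idx.take(np.array([4, 0, 2])))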
Code Example #18
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     return super(DatetimeIndexOpsMixin, self).take(indices, axis)
Code Example #19
File: algorithms.py Project: yaduart/pandas
 def func(arr, indexer, out, fill_value=np.nan):
     indexer = com._ensure_int64(indexer)
     _take_nd_generic(arr,
                      indexer,
                      out,
                      axis=axis,
                      fill_value=fill_value,
                      mask_info=mask_info)
Code Example #20
File: merge.py Project: nitfer/pandas
def _get_multiindex_indexer(join_keys, index, sort=False):
    shape = []
    labels = []
    for level, key in zip(index.levels, join_keys):
        llab, rlab, count = _factorize_keys(level, key, sort=False)
        labels.append(rlab)
        shape.append(count)

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    left_group_key, right_group_key, max_groups = _factorize_keys(left_group_key, right_group_key, sort=False)

    left_indexer, right_indexer = algos.left_outer_join(
        com._ensure_int64(left_group_key), com._ensure_int64(right_group_key), max_groups, sort=False
    )

    return left_indexer, right_indexer
Code Example #21
File: nanops.py Project: alfonsodiecko/PYTHON_DIST
def unique1d(values):
    """
    Hash table-based unique
    """
    if np.issubdtype(values.dtype, np.floating):
        table = _hash.Float64HashTable(len(values))
        uniques = np.array(table.unique(com._ensure_float64(values)), dtype=np.float64)
    elif np.issubdtype(values.dtype, np.datetime64):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
        uniques = uniques.view("M8[ns]")
    elif np.issubdtype(values.dtype, np.integer):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
    else:
        table = _hash.PyObjectHashTable(len(values))
        uniques = table.unique(com._ensure_object(values))
    return uniques
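
For context, the public pandas.unique is a thin wrapper over this kind of hash-table lookup; a small hypothetical call (input chosen arbitrarily):

import numpy as np
import pandas as pd

# Uniques come back in order of first appearance, not sorted.
print(pd.unique(np.array([3, 1, 3, 2, 1])))  # [3 1 2]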
Code Example #22
File: index.py Project: candre717/pandas
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     taken = self.values.take(indices, axis=axis)
     return DatetimeIndex(taken, tz=self.tz, name=self.name)
Code Example #23
File: index.py Project: joaonatali/pandas
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     indices = com._ensure_platform_int(indices)
     taken = self.values.take(indices, axis=axis)
     return self._simple_new(taken, self.name, None, self.tz)
Code Example #24
File: index.py Project: zkluo1/pandas
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     indices = com._ensure_platform_int(indices)
     taken = self.values.take(indices, axis=axis)
     return DatetimeIndex(taken, tz=self.tz, name=self.name)
Code Example #25
File: index.py Project: 17705724576-M13Kd/pandas
    def _simple_new(cls, values, name, freq=None, tz=None):
        if values.dtype != _NS_DTYPE:
            values = com._ensure_int64(values).view(_NS_DTYPE)

        result = values.view(cls)
        result.name = name
        result.offset = freq
        result.tz = tools._maybe_get_tz(tz)

        return result
Code Example #26
File: index.py Project: joaonatali/pandas
    def _simple_new(cls, values, name, freq=None, tz=None):
        if values.dtype != _NS_DTYPE:
            values = com._ensure_int64(values).view(_NS_DTYPE)

        result = values.view(cls)
        result.name = name
        result.offset = freq
        result.tz = tools._maybe_get_tz(tz)

        return result
Code Example #27
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     indices = com._ensure_int64(indices)
     maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     taken = self.asi8.take(com._ensure_platform_int(indices))
     return self._shallow_copy(taken, freq=None)
Code Example #28
def unique1d(values):
    """
    Hash table-based unique
    """
    if np.issubdtype(values.dtype, np.floating):
        table = _hash.Float64HashTable(len(values))
        uniques = np.array(table.unique(com._ensure_float64(values)),
                           dtype=np.float64)
    elif np.issubdtype(values.dtype, np.datetime64):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
        uniques = uniques.view('M8[ns]')
    elif np.issubdtype(values.dtype, np.integer):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
    else:
        table = _hash.PyObjectHashTable(len(values))
        uniques = table.unique(com._ensure_object(values))
    return uniques
Code Example #29
File: index.py Project: MikeLindenau/pandas
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take
     """
     maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     indices = com._ensure_platform_int(indices)
     taken = self.values.take(indices, axis=axis)
     return self._simple_new(taken, self.name, None, self.tz)
Code Example #30
def _make_index_array_level(lev, lab):
    """ create the combined index array, preserving nans, return an array """
    mask = lab == -1
    if not mask.any():
        return lev

    l = np.arange(len(lab))
    mask_labels = np.empty(len(mask[mask]), dtype=object)
    mask_labels.fill(np.nan)
    mask_indexer = com._ensure_int64(l[mask])

    labels = lev
    labels_indexer = com._ensure_int64(l[~mask])

    new_labels = np.empty(tuple([len(lab)]), dtype=object)
    new_labels[labels_indexer] = labels
    new_labels[mask_indexer] = mask_labels

    return new_labels
Code Example #31
File: reshape.py Project: AjayRamanathan/pandas
def _make_index_array_level(lev, lab):
    """ create the combined index array, preserving nans, return an array """
    mask = lab == -1
    if not mask.any():
        return lev

    l = np.arange(len(lab))
    mask_labels = np.empty(len(mask[mask]), dtype=object)
    mask_labels.fill(np.nan)
    mask_indexer = com._ensure_int64(l[mask])

    labels = lev
    labels_indexer = com._ensure_int64(l[~mask])

    new_labels = np.empty(tuple([len(lab)]), dtype=object)
    new_labels[labels_indexer] = labels
    new_labels[mask_indexer] = mask_labels

    return new_labels
Code Example #32
File: base.py Project: bjacobowski/pandas
 def take(self, indices, axis=0, **kwargs):
     """
     Analogous to ndarray.take
     """
     indices = com._ensure_int64(indices)
     maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
     if isinstance(maybe_slice, slice):
         return self[maybe_slice]
     taken = self.asi8.take(com._ensure_platform_int(indices))
     return self._shallow_copy(taken, freq=None)
Code Example #33
def _value_counts_arraylike(values, dropna=True):
    is_datetimetz = com.is_datetimetz(values)
    is_period = (isinstance(values, gt.ABCPeriodIndex)
                 or com.is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz:
            if isinstance(orig, gt.ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif com.is_float_dtype(dtype):
        values = com._ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
Code Example #34
File: algorithms.py Project: takluyver/pandas
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
Code Example #35
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
Code Example #36
File: algorithms.py Project: SocialQ/pandas
def _get_hash_table_and_cast(values):
    if com.is_float_dtype(values):
        klass = lib.Float64HashTable
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values):
        klass = lib.Int64HashTable
        values = com._ensure_int64(values)
    else:
        klass = lib.PyObjectHashTable
        values = com._ensure_object(values)
    return klass, values
Code Example #37
def _get_hash_table_and_cast(values):
    if com.is_float_dtype(values):
        klass = lib.Float64HashTable
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values):
        klass = lib.Int64HashTable
        values = com._ensure_int64(values)
    else:
        klass = lib.PyObjectHashTable
        values = com._ensure_object(values)
    return klass, values
Code Example #38
File: algorithms.py Project: jcfr/pandas
def _value_counts_arraylike(values, dropna=True):
    is_datetimetz = com.is_datetimetz(values)
    is_period = (isinstance(values, gt.ABCPeriodIndex) or
                 com.is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz:
            if isinstance(orig, gt.ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif com.is_float_dtype(dtype):
        values = com._ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
Code Example #39
File: merge.py Project: stenri/pandas
def _get_multiindex_indexer(join_keys, index, sort=False):
    shape = []
    labels = []
    for level, key in zip(index.levels, join_keys):
        llab, rlab, count = _factorize_keys(level, key, sort=False)
        labels.append(rlab)
        shape.append(count)

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    left_group_key, right_group_key, max_groups = \
        _factorize_keys(left_group_key, right_group_key,
                        sort=False)

    left_indexer, right_indexer = \
        algos.left_outer_join(com._ensure_int64(left_group_key),
                              com._ensure_int64(right_group_key),
                              max_groups, sort=False)

    return left_indexer, right_indexer
Code Example #40
def _factorize_keys(lk, rk, sort=True):
    if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk):
        lk = lk.values
        rk = rk.values
    if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk):
        klass = _hash.Int64Factorizer
        lk = com._ensure_int64(com._values_from_object(lk))
        rk = com._ensure_int64(com._values_from_object(rk))
    else:
        klass = _hash.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count
Code Example #41
    def _simple_new(cls, values, name=None, freq=None, **kwargs):
        if not getattr(values, 'dtype', None):
            values = np.array(values, copy=False)
        if values.dtype == np.object_:
            values = tslib.array_to_timedelta64(values)
        if values.dtype != _TD_DTYPE:
            values = com._ensure_int64(values).view(_TD_DTYPE)

        result = object.__new__(cls)
        result._data = values
        result.name = name
        result.freq = freq
        result._reset_identity()
        return result
Code Example #42
File: merge.py Project: srivallapa27/pandas
def _factorize_keys(lk, rk, sort=True):
    if com.is_integer_dtype(lk) and com.is_integer_dtype(rk):
        klass = lib.Int64Factorizer
        lk = com._ensure_int64(lk)
        rk = com._ensure_int64(rk)
    else:
        klass = lib.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab, _ = rizer.factorize(lk)
    rlab, _ = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        llab, rlab = _sort_labels(rizer.uniques, llab, rlab)

        # TODO: na handling

    return llab, rlab, count
Code Example #43
File: merge.py Project: edmoody/pandas
def _factorize_keys(lk, rk, sort=True):
    if com.is_integer_dtype(lk) and com.is_integer_dtype(rk):
        klass = lib.Int64Factorizer
        lk = com._ensure_int64(lk)
        rk = com._ensure_int64(rk)
    else:
        klass = lib.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab, _ = rizer.factorize(lk)
    rlab, _ = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        llab, rlab = _sort_labels(rizer.uniques, llab, rlab)

        # TODO: na handling

    return llab, rlab, count
Code Example #44
File: algorithms.py Project: mrorii/pandas
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map["float64"]
        values = com._ensure_float64(values)
    elif com.is_datetime64_dtype(values):
        f = func_map["int64"]
        values = values.view("i8")
    elif com.is_integer_dtype(values):
        f = func_map["int64"]
        values = com._ensure_int64(values)
    else:
        f = func_map["generic"]
        values = com._ensure_object(values)
    return f, values
Code Example #45
File: tdi.py Project: AbnerZheng/pandas
    def _simple_new(cls, values, name=None, freq=None, **kwargs):
        if not getattr(values, 'dtype', None):
            values = np.array(values, copy=False)
        if values.dtype == np.object_:
            values = tslib.array_to_timedelta64(values)
        if values.dtype != _TD_DTYPE:
            values = com._ensure_int64(values).view(_TD_DTYPE)

        result = object.__new__(cls)
        result._data = values
        result.name = name
        result.freq = freq
        result._reset_identity()
        return result
Code Example #46
def value_counts(values, sort=True, ascending=False, normalize=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):

        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
Code Example #47
File: base.py Project: yaduart/pandas
    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
        indices = com._ensure_int64(indices)

        maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
        if isinstance(maybe_slice, slice):
            return self[maybe_slice]

        taken = self._assert_take_fillable(self.asi8, indices,
                                           allow_fill=allow_fill,
                                           fill_value=fill_value,
                                           na_value=tslib.iNaT)

        # keep freq in PeriodIndex, reset otherwise
        freq = self.freq if isinstance(self, com.ABCPeriodIndex) else None
        return self._shallow_copy(taken, freq=freq)
Code Example #48
File: base.py Project: Allen1203/pandas
    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
        indices = com._ensure_int64(indices)

        maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
        if isinstance(maybe_slice, slice):
            return self[maybe_slice]

        taken = self._assert_take_fillable(self.asi8, indices,
                                           allow_fill=allow_fill,
                                           fill_value=fill_value,
                                           na_value=tslib.iNaT)

        # keep freq in PeriodIndex, reset otherwise
        freq = self.freq if isinstance(self, com.ABCPeriodIndex) else None
        return self._shallow_copy(taken, freq=freq)
Code Example #49
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)

    elif com.needs_i8_conversion(values):
        f = func_map['int64']
        values = values.view('i8')

    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
Code Example #50
File: base.py Project: sxwang/pandas
    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
        """
        Analogous to ndarray.take
        """
        indices = com._ensure_int64(indices)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
        if isinstance(maybe_slice, slice):
            return self[maybe_slice]
        taken = self.asi8.take(com._ensure_platform_int(indices))

        # only fill if we are passing a non-None fill_value
        if allow_fill and fill_value is not None:
            mask = indices == -1
            if mask.any():
                taken[mask] = tslib.iNaT
        return self._shallow_copy(taken, freq=None)
Code Example #51
File: base.py Project: stefan-jansen/pandas
    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
        """
        Analogous to ndarray.take
        """
        indices = com._ensure_int64(indices)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
        if isinstance(maybe_slice, slice):
            return self[maybe_slice]
        taken = self.asi8.take(com._ensure_platform_int(indices))

        # only fill if we are passing a non-None fill_value
        if allow_fill and fill_value is not None:
            mask = indices == -1
            if mask.any():
                taken[mask] = tslib.iNaT
        return self._shallow_copy(taken, freq=None)
Code Example #52
    def mode(self):
        """
        Returns the mode(s) of the Categorical.

        Empty if nothing occurs at least 2 times.  Always returns `Categorical` even
        if only one value.

        Returns
        -------
        modes : `Categorical` (sorted)
        """

        import pandas.hashtable as htable
        good = self._codes != -1
        result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))),
                             categories=self.categories, ordered=self.ordered,
                             name=self.name, fastpath=True)
        return result
Code Example #53
File: categorical.py Project: janschulz/pandas
    def mode(self):
        """
        Returns the mode(s) of the Categorical.

        Empty if nothing occurs at least 2 times.  Always returns `Categorical` even
        if only one value.

        Returns
        -------
        modes : `Categorical` (sorted)
        """

        import pandas.hashtable as htable
        good = self._codes != -1
        result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))),
                             categories=self.categories, ordered=self.ordered,
                             name=self.name, fastpath=True)
        return result
Code Example #54
File: groupby.py Project: cournape/pandas
def get_group_index(label_list, shape):
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations.
    """
    if len(label_list) == 1:
        return label_list[0]

    n = len(label_list[0])
    group_index = np.zeros(n, dtype=np.int64)
    mask = np.zeros(n, dtype=bool)
    for i in xrange(len(shape)):
        stride = np.prod([x for x in shape[i+1:]], dtype=np.int64)
        group_index += com._ensure_int64(label_list[i]) * stride
        mask |= label_list[i] < 0

    np.putmask(group_index, mask, -1)
    return group_index
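
A rough worked illustration of the offset arithmetic in get_group_index above, using made-up labels for two levels with 2 and 3 possible values:

import numpy as np

label_list = [np.array([0, 1, 1, 0], dtype=np.int64),
              np.array([2, 0, 1, 2], dtype=np.int64)]
shape = (2, 3)
# Stride for the first level is prod(shape[1:]) == 3; the last level has stride 1.
stride0 = int(np.prod(shape[1:]))
# Offset into the flattened cartesian product: label0 * 3 + label1 * 1.
group_index = label_list[0] * stride0 + label_list[1]
print(group_index)  # [2 3 4 2]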
Code Example #55
File: algorithms.py Project: yunh-net/pandas
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Code Example #56
def _compress_group_index(group_index, sort=True):
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """

    uniques = []
    table = lib.Int64HashTable(len(group_index))

    group_index = com._ensure_int64(group_index)

    # note, group labels come out ascending (ie, 1,2,3 etc)
    comp_ids = table.get_labels_groupby(group_index, uniques)

    # these are the unique ones we observed, in the order we observed them
    obs_group_ids = np.array(uniques, dtype='i8')

    if sort and len(obs_group_ids) > 0:
        # sorter is index where elements ought to go
        sorter = obs_group_ids.argsort()

        # reverse_indexer is where elements came from
        reverse_indexer = np.empty(len(sorter), dtype='i4')
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = comp_ids < 0

        # move comp_ids to right locations (ie, unsort ascending labels)
        comp_ids = reverse_indexer.take(comp_ids)
        np.putmask(comp_ids, mask, -1)

        # sort observed ids
        obs_group_ids = obs_group_ids.take(sorter)

    return comp_ids, obs_group_ids
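
Continuing the made-up labels from the get_group_index sketch earlier, a NumPy-only approximation of the sort=True path of _compress_group_index (the real code builds the labels with an Int64HashTable):

import numpy as np

group_index = np.array([2, 3, 4, 2], dtype=np.int64)
# Sorted observed group ids, plus dense comp_ids pointing back into them.
obs_group_ids, comp_ids = np.unique(group_index, return_inverse=True)
print(comp_ids)       # [0 1 2 0]
print(obs_group_ids)  # [2 3 4]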
Code Example #57
File: algorithms.py Project: durden/pandas
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Code Example #58
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
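
Finally, a hypothetical call through the public API that exercises the dropna handling shown in this last version:

import pandas as pd

s = pd.Series([1, 1, 2, None])
print(s.value_counts())              # 1 appears twice, 2 once; NaN is dropped
print(s.value_counts(dropna=False))  # the missing value gets its own row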