Exemple #1
0
    def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
                periods=None, copy=False, name=None, tz=None, dtype=None,
                **kwargs):

        if periods is not None:
            if is_float(periods):
                periods = int(periods)
            elif not is_integer(periods):
                raise ValueError('Periods must be a number, got %s' %
                                 str(periods))

        if name is None and hasattr(data, 'name'):
            name = data.name

        if dtype is not None:
            dtype = pandas_dtype(dtype)
            if not is_period_dtype(dtype):
                raise ValueError('dtype must be PeriodDtype')
            if freq is None:
                freq = dtype.freq
            elif freq != dtype.freq:
                msg = 'specified freq and dtype are different'
                raise IncompatibleFrequency(msg)

        if data is None:
            if ordinal is not None:
                data = np.asarray(ordinal, dtype=np.int64)
            else:
                data, freq = cls._generate_range(start, end, periods,
                                                 freq, kwargs)
        else:
            ordinal, freq = cls._from_arraylike(data, freq, tz)
            data = np.array(ordinal, dtype=np.int64, copy=copy)

        return cls._simple_new(data, name=name, freq=freq)
Exemple #2
0
    def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
                periods=None, copy=False, name=None, tz=None, dtype=None,
                **kwargs):

        if periods is not None:
            if is_float(periods):
                periods = int(periods)
            elif not is_integer(periods):
                raise ValueError('Periods must be a number, got %s' %
                                 str(periods))

        if name is None and hasattr(data, 'name'):
            name = data.name

        if dtype is not None:
            dtype = pandas_dtype(dtype)
            if not is_period_dtype(dtype):
                raise ValueError('dtype must be PeriodDtype')
            if freq is None:
                freq = dtype.freq
            elif freq != dtype.freq:
                msg = 'specified freq and dtype are different'
                raise IncompatibleFrequency(msg)

        if data is None:
            if ordinal is not None:
                data = np.asarray(ordinal, dtype=np.int64)
            else:
                data, freq = cls._generate_range(start, end, periods,
                                                 freq, kwargs)
        else:
            ordinal, freq = cls._from_arraylike(data, freq, tz)
            data = np.array(ordinal, dtype=np.int64, copy=copy)

        return cls._simple_new(data, name=name, freq=freq)
Exemple #3
0
def _value_counts_arraylike(values, dropna=True):
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:

        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        mask = isnull(values)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
Exemple #4
0
    def test_construction(self):
        with tm.assertRaises(ValueError):
            PeriodDtype('xx')

        for s in ['period[D]', 'Period[D]', 'D']:
            dt = PeriodDtype(s)
            self.assertEqual(dt.freq, pd.tseries.offsets.Day())
            self.assertTrue(is_period_dtype(dt))

        for s in ['period[3D]', 'Period[3D]', '3D']:
            dt = PeriodDtype(s)
            self.assertEqual(dt.freq, pd.tseries.offsets.Day(3))
            self.assertTrue(is_period_dtype(dt))

        for s in ['period[26H]', 'Period[26H]', '26H',
                  'period[1D2H]', 'Period[1D2H]', '1D2H']:
            dt = PeriodDtype(s)
            self.assertEqual(dt.freq, pd.tseries.offsets.Hour(26))
            self.assertTrue(is_period_dtype(dt))
Exemple #5
0
    def test_construction(self):
        with tm.assertRaises(ValueError):
            PeriodDtype('xx')

        for s in ['period[D]', 'Period[D]', 'D']:
            dt = PeriodDtype(s)
            self.assertEqual(dt.freq, pd.tseries.offsets.Day())
            self.assertTrue(is_period_dtype(dt))

        for s in ['period[3D]', 'Period[3D]', '3D']:
            dt = PeriodDtype(s)
            self.assertEqual(dt.freq, pd.tseries.offsets.Day(3))
            self.assertTrue(is_period_dtype(dt))

        for s in ['period[26H]', 'Period[26H]', '26H',
                  'period[1D2H]', 'Period[1D2H]', '1D2H']:
            dt = PeriodDtype(s)
            self.assertEqual(dt.freq, pd.tseries.offsets.Hour(26))
            self.assertTrue(is_period_dtype(dt))
Exemple #6
0
 def astype(self, dtype, copy=True, how='start'):
     dtype = pandas_dtype(dtype)
     if is_object_dtype(dtype):
         return self.asobject
     elif is_integer_dtype(dtype):
         return Index(self.values.astype('i8', copy=copy), name=self.name,
                      dtype='i8')
     elif is_datetime64_dtype(dtype):
         return self.to_timestamp(how=how)
     elif is_datetime64tz_dtype(dtype):
         return self.to_timestamp(how=how).tz_localize(dtype.tz)
     elif is_period_dtype(dtype):
         return self.asfreq(freq=dtype.freq)
     raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
Exemple #7
0
 def astype(self, dtype, copy=True, how='start'):
     dtype = pandas_dtype(dtype)
     if is_object_dtype(dtype):
         return self.asobject
     elif is_integer_dtype(dtype):
         if copy:
             return self._int64index.copy()
         else:
             return self._int64index
     elif is_datetime64_dtype(dtype):
         return self.to_timestamp(how=how)
     elif is_datetime64tz_dtype(dtype):
         return self.to_timestamp(how=how).tz_localize(dtype.tz)
     elif is_period_dtype(dtype):
         return self.asfreq(freq=dtype.freq)
     raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
Exemple #8
0
 def astype(self, dtype, copy=True, how='start'):
     dtype = pandas_dtype(dtype)
     if is_object_dtype(dtype):
         return self.asobject
     elif is_integer_dtype(dtype):
         if copy:
             return self._int64index.copy()
         else:
             return self._int64index
     elif is_datetime64_dtype(dtype):
         return self.to_timestamp(how=how)
     elif is_datetime64tz_dtype(dtype):
         return self.to_timestamp(how=how).tz_localize(dtype.tz)
     elif is_period_dtype(dtype):
         return self.asfreq(freq=dtype.freq)
     raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
Exemple #9
0
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          str
    categorical     any
    =============== =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
          is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
Exemple #10
0
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          str
    categorical     any
    =============== =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x)
          or is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
Exemple #11
0
    def __init__(self,
                 obj,
                 orient,
                 date_format,
                 double_precision,
                 ensure_ascii,
                 date_unit,
                 default_handler=None):
        """
        Adds a `schema` attribut with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is, forces orient to records, and forces
        date_format to 'iso'.
        """
        super(JSONTableWriter, self).__init__(obj,
                                              orient,
                                              date_format,
                                              double_precision,
                                              ensure_ascii,
                                              date_unit,
                                              default_handler=default_handler)

        if date_format != 'iso':
            msg = ("Trying to write with `orient='table'` and "
                   "`date_format='%s'`. Table Schema requires dates "
                   "to be formatted with `date_format='iso'`" % date_format)
            raise ValueError(msg)

        self.schema = build_table_schema(obj)

        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        if ((obj.ndim == 1) and (obj.name in set(obj.index.names))
                or len(obj.columns & obj.index.names)):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=['timedelta']).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serialzing
        if is_period_dtype(obj.index):
            obj.index = obj.index.to_timestamp()

        self.obj = obj.reset_index()
        self.date_format = 'iso'
        self.orient = 'records'
Exemple #12
0
def make_field(arr, dtype=None):
    dtype = dtype or arr.dtype
    field = {'name': arr.name or 'values', 'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_period_dtype(arr):
        field['freq'] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
Exemple #13
0
def make_field(arr, dtype=None):
    dtype = dtype or arr.dtype
    field = {'name': arr.name or 'values',
             'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_period_dtype(arr):
        field['freq'] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
Exemple #14
0
    def test_basic(self):
        self.assertTrue(is_period_dtype(self.dtype))

        pidx = pd.period_range('2013-01-01 09:00', periods=5, freq='H')

        self.assertTrue(is_period_dtype(pidx.dtype))
        self.assertTrue(is_period_dtype(pidx))
        self.assertTrue(is_period(pidx))

        s = Series(pidx, name='A')
        # dtypes
        # series results in object dtype currently,
        # is_period checks period_arraylike
        self.assertFalse(is_period_dtype(s.dtype))
        self.assertFalse(is_period_dtype(s))
        self.assertTrue(is_period(s))

        self.assertFalse(is_period_dtype(np.dtype('float64')))
        self.assertFalse(is_period_dtype(1.0))
        self.assertFalse(is_period(np.dtype('float64')))
        self.assertFalse(is_period(1.0))
Exemple #15
0
    def test_basic(self):
        self.assertTrue(is_period_dtype(self.dtype))

        pidx = pd.period_range('2013-01-01 09:00', periods=5, freq='H')

        self.assertTrue(is_period_dtype(pidx.dtype))
        self.assertTrue(is_period_dtype(pidx))
        self.assertTrue(is_period(pidx))

        s = Series(pidx, name='A')
        # dtypes
        # series results in object dtype currently,
        # is_period checks period_arraylike
        self.assertFalse(is_period_dtype(s.dtype))
        self.assertFalse(is_period_dtype(s))
        self.assertTrue(is_period(s))

        self.assertFalse(is_period_dtype(np.dtype('float64')))
        self.assertFalse(is_period_dtype(1.0))
        self.assertFalse(is_period(np.dtype('float64')))
        self.assertFalse(is_period(1.0))
Exemple #16
0
def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps: array-like
    values: array-like

    Returns
    -------
    boolean array same length as comps
    """

    if not is_list_like(comps):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(comps).__name__))
    if not is_list_like(values):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(values).__name__))

    from pandas import DatetimeIndex, PeriodIndex

    if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
        values = np.array(list(values), dtype='object')

    if needs_i8_conversion(comps):
        if is_period_dtype(values):
            comps = PeriodIndex(comps)
            values = PeriodIndex(values)
        else:
            comps = DatetimeIndex(comps)
            values = DatetimeIndex(values)

        values = values.asi8
        comps = comps.asi8
    elif is_bool_dtype(comps):

        try:
            comps = np.asarray(comps).view('uint8')
            values = np.asarray(values).view('uint8')
        except TypeError:
            # object array conversion will fail
            pass
    else:
        comps = np.asarray(comps)
        values = np.asarray(values)

    # GH11232
    # work-around for numpy < 1.8 and comparisions on py3
    # faster for larger cases to use np.in1d
    if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
        f = lambda x, y: np.in1d(x, np.asarray(list(y)))
    elif is_int64_dtype(comps):
        f = lambda x, y: lib.ismember_int64(x, set(y))
    else:
        f = lambda x, y: lib.ismember(x, set(values))

    return f(comps, values)
Exemple #17
0
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # handling two possibilities here
    # - for a numpy datetimelike simply view as i8 then cast back
    # - for an extension datetimelike view as i8 then
    #   reconstruct from boxed values to transfer metadata
    dtype = None
    if needs_i8_conversion(values):
        if is_period_dtype(values):
            values = PeriodIndex(values)
            vals = values.asi8
        elif is_datetimetz(values):
            values = DatetimeIndex(values)
            vals = values.asi8
        else:
            # numpy dtype
            dtype = values.dtype
            vals = values.view(np.int64)
    else:
        vals = np.asarray(values)

    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques,
                                    labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    if dtype is not None:
        uniques = uniques.astype(dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
Exemple #18
0
    def __new__(cls,
                data=None,
                ordinal=None,
                freq=None,
                start=None,
                end=None,
                periods=None,
                copy=False,
                name=None,
                tz=None,
                dtype=None,
                **kwargs):

        if periods is not None:
            if is_float(periods):
                periods = int(periods)
            elif not is_integer(periods):
                raise ValueError('Periods must be a number, got %s' %
                                 str(periods))

        if name is None and hasattr(data, 'name'):
            name = data.name

        if dtype is not None:
            dtype = pandas_dtype(dtype)
            if not is_period_dtype(dtype):
                raise ValueError('dtype must be PeriodDtype')
            if freq is None:
                freq = dtype.freq
            elif freq != dtype.freq:
                msg = 'specified freq and dtype are different'
                raise IncompatibleFrequency(msg)

        # coerce freq to freq object, otherwise it can be coerced elementwise
        # which is slow
        if freq:
            freq = Period._maybe_convert_freq(freq)

        if data is None:
            if ordinal is not None:
                data = np.asarray(ordinal, dtype=np.int64)
            else:
                data, freq = cls._generate_range(start, end, periods, freq,
                                                 kwargs)
            return cls._from_ordinals(data, name=name, freq=freq)

        if isinstance(data, PeriodIndex):
            if freq is None or freq == data.freq:  # no freq change
                freq = data.freq
                data = data._values
            else:
                base1, _ = _gfc(data.freq)
                base2, _ = _gfc(freq)
                data = period.period_asfreq_arr(data._values, base1, base2, 1)
            return cls._simple_new(data, name=name, freq=freq)

        # not array / index
        if not isinstance(
                data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)):
            if is_scalar(data) or isinstance(data, Period):
                cls._scalar_data_error(data)

            # other iterable of some kind
            if not isinstance(data, (list, tuple)):
                data = list(data)

            data = np.asarray(data)

        # datetime other than period
        if is_datetime64_dtype(data.dtype):
            data = dt64arr_to_periodarr(data, freq, tz)
            return cls._from_ordinals(data, name=name, freq=freq)

        # check not floats
        if infer_dtype(data) == 'floating' and len(data) > 0:
            raise TypeError("PeriodIndex does not allow "
                            "floating point in construction")

        # anything else, likely an array of strings or periods
        data = _ensure_object(data)
        freq = freq or period.extract_freq(data)
        data = period.extract_ordinals(data, freq)
        return cls._from_ordinals(data, name=name, freq=freq)
Exemple #19
0
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # handling two possibilities here
    # - for a numpy datetimelike simply view as i8 then cast back
    # - for an extension datetimelike view as i8 then
    #   reconstruct from boxed values to transfer metadata
    dtype = None
    if needs_i8_conversion(values):
        if is_period_dtype(values):
            values = PeriodIndex(values)
            vals = values.asi8
        elif is_datetimetz(values):
            values = DatetimeIndex(values)
            vals = values.asi8
        else:
            # numpy dtype
            dtype = values.dtype
            vals = values.view(np.int64)
    else:
        vals = np.asarray(values)

    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    check_nulls = not is_integer_dtype(values)
    labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if dtype is not None:
        uniques = uniques.astype(dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques