Code example #1
File: all.py Project: caskeep/interpret
def _assign_feature_type(feature_type, unique_count=0):
    if is_string_dtype(feature_type) or (
        is_numeric_dtype(feature_type) and unique_count <= 2
    ):
        return "categorical"
    elif is_numeric_dtype(feature_type):
        return "continuous"
    else:
        return "unknown"
Code example #2
File: test_common.py Project: Michael-E-Rose/pandas
def test_is_numeric_dtype():
    assert not com.is_numeric_dtype(str)
    assert not com.is_numeric_dtype(np.datetime64)
    assert not com.is_numeric_dtype(np.timedelta64)
    assert not com.is_numeric_dtype(np.array(['a', 'b']))
    assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64))

    assert com.is_numeric_dtype(int)
    assert com.is_numeric_dtype(float)
    assert com.is_numeric_dtype(np.uint64)
    assert com.is_numeric_dtype(pd.Series([1, 2]))
    assert com.is_numeric_dtype(pd.Index([1, 2.]))
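The same predicate is importable from the public pandas.api.types path; a quick sketch mirroring a few of the assertions above:

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

print(is_numeric_dtype(np.array(["a", "b"])))  # False: string dtype
print(is_numeric_dtype(pd.Series([1, 2])))     # True: int64
print(is_numeric_dtype(pd.Index([1, 2.0])))    # True: float64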
Code example #3
File: nanops.py Project: mficek/pandas
def _na_for_min_count(values, axis):
    """Return the missing value for `values`

    Parameters
    ----------
    values : ndarray
    axis : int or None
        axis for the reduction

    Returns
    -------
    result : scalar or ndarray
        For 1-D values, returns a scalar of the correct missing type.
        For 2-D values, returns a 1-D array where each element is missing.
    """
    # we either return np.nan or pd.NaT
    if is_numeric_dtype(values):
        values = values.astype('float64')
    fill_value = na_value_for_dtype(values.dtype)

    if values.ndim == 1:
        return fill_value
    else:
        result_shape = (values.shape[:axis] +
                        values.shape[axis + 1:])
        result = np.empty(result_shape, dtype=values.dtype)
        result.fill(fill_value)
        return result
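The cast to float64 matters because integer ndarrays cannot hold NaN, while datetime64/timedelta64 use NaT. A rough public-API sketch of the same fill-value selection (na_value_for_dtype itself is pandas-internal):

import numpy as np
from pandas.api.types import is_numeric_dtype

def na_for(values):
    if is_numeric_dtype(values):
        values = values.astype("float64")  # ints widen so NaN fits
    if values.dtype.kind == "M":           # datetime64
        return np.datetime64("NaT")
    if values.dtype.kind == "m":           # timedelta64
        return np.timedelta64("NaT")
    return np.nan

print(na_for(np.array([1, 2, 3])))                   # nan
print(na_for(np.array([], dtype="datetime64[ns]")))  # NaT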
Code example #4
File: all.py Project: caskeep/interpret
def autogen_schema(X, ordinal_max_items=2, feature_names=None, feature_types=None):
    """ Generates data schema for a given dataset as JSON representable.

    Args:
        X: Dataframe/ndarray to build schema from.
        ordinal_max_items: If a numeric column's cardinality
            is at most this integer,
            consider it as ordinal instead of continuous.
        feature_names: Feature names
        feature_types: Feature types

    Returns:
        A dictionary - schema that encapsulates column information,
        such as type and domain.
    """
    schema = OrderedDict()
    col_number = 0
    if isinstance(X, np.ndarray):
        log.warning(
            "Passing a numpy array to schema autogen when it should be dataframe."
        )
        if feature_names is None:
            X = pd.DataFrame(
                X, columns=["col_" + str(i) for i in range(X.shape[1])]
            ).infer_objects()
        else:
            X = pd.DataFrame(X, columns=feature_names).infer_objects()
    if isinstance(X, NDFrame):
        for name, col_dtype in zip(X.dtypes.index, X.dtypes):
            schema[name] = {}
            if is_numeric_dtype(col_dtype):
                # schema[name]['type'] = 'continuous'
                # TODO: Fix this once we know it works.
                if len(set(X[name])) > ordinal_max_items:
                    schema[name]["type"] = "continuous"
                else:
                    # TODO: Work with ordinal later.
                    schema[name]["type"] = "categorical"
                    # schema[name]['type'] = 'ordinal'
                    # schema[name]['order'] = list(set(X[name]))
            elif is_string_dtype(col_dtype):
                schema[name]["type"] = "categorical"
            else:
                warnings.warn("Unknown column: " + name, RuntimeWarning)
                schema[name]["type"] = "unknown"
            schema[name]["column_number"] = col_number
            col_number += 1

        # Override if feature_types is passed as arg.
        if feature_types is not None:
            for idx, name in enumerate(X.dtypes.index):
                schema[name]["type"] = feature_types[idx]
    else:
        raise TypeError("GA2M only supports numpy arrays or pandas dataframes.")

    return schema
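A hedged usage sketch for autogen_schema as defined above (the original module supplies OrderedDict, NDFrame, log and the dtype predicates); the expected types follow the branches in the code:

import pandas as pd

df = pd.DataFrame({
    "age": [23, 35, 41, 29],           # > ordinal_max_items uniques -> continuous
    "smoker": [0, 1, 0, 1],            # <= 2 uniques -> categorical
    "city": ["NY", "LA", "NY", "SF"],  # string dtype -> categorical
})
# schema = autogen_schema(df)
# schema["age"]["type"]    -> "continuous"
# schema["smoker"]["type"] -> "categorical"
# schema["city"]["type"]   -> "categorical"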
Code example #5
File: table_schema.py Project: sinhrks/pandas
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          str
    categorical     any
    ==============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
          is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
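This mapping backs pandas' public Table Schema builder, so the behavior is easy to check from the outside (a sketch; build_table_schema has been public since pandas 0.20):

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({
    "a": [1, 2],      # int64   -> integer
    "b": [1.5, 2.5],  # float64 -> number
    "c": pd.to_datetime(["2020-01-01", "2020-01-02"]),  # -> datetime
})
schema = build_table_schema(df, index=False)
print([(f["name"], f["type"]) for f in schema["fields"]])
# [('a', 'integer'), ('b', 'number'), ('c', 'datetime')]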
Code example #6
File: nanops.py Project: Xbar/pandas
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in compat.iteritems(self.kwargs):
                    if k not in kwds:
                        kwds[k] = v
            try:
                if values.size == 0:

                    # we either return np.nan or pd.NaT
                    if is_numeric_dtype(values):
                        values = values.astype('float64')
                    fill_value = na_value_for_dtype(values.dtype)

                    if values.ndim == 1:
                        return fill_value
                    else:
                        result_shape = (values.shape[:axis] +
                                        values.shape[axis + 1:])
                        result = np.empty(result_shape, dtype=values.dtype)
                        result.fill(fill_value)
                        return result

                if (_USE_BOTTLENECK and skipna and
                        _bn_ok_dtype(values.dtype, bn_name)):
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            except Exception:
                try:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
                except ValueError as e:
                    # we want to transform an object array
                    # ValueError message to the more typical TypeError
                    # e.g. this is normally a disallowed function on
                    # object arrays that contain strings

                    if is_object_dtype(values):
                        raise TypeError(e)
                    raise

            return result
Code example #7
File: nanops.py Project: mficek/pandas
def _maybe_null_out(result, axis, mask, min_count=1):
    if axis is not None and getattr(result, 'ndim', False):
        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
        if np.any(null_mask):
            if is_numeric_dtype(result):
                if np.iscomplexobj(result):
                    result = result.astype('c16')
                else:
                    result = result.astype('f8')
                result[null_mask] = np.nan
            else:
                # GH12941, use None to auto cast null
                result[null_mask] = None
    elif result is not tslib.NaT:
        null_mask = mask.size - mask.sum()
        if null_mask < min_count:
            result = np.nan

    return result
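The cast before masking is load-bearing: assigning np.nan into an integer ndarray raises, so the result is widened to 'f8' (or 'c16' for complex) first. A minimal illustration:

import numpy as np

result = np.array([10, 20, 30])
null_mask = np.array([False, True, False])
try:
    result[null_mask] = np.nan  # ValueError on int64
except ValueError:
    result = result.astype("f8")
    result[null_mask] = np.nan
print(result)  # [10. nan 30.]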
Code example #8
    def test_apply_custom_column_types(self):
        data = {
            'to_numerical': [
                '150', '200', '50', '10', '5', '150', '200', '50', '10', '5',
                '1'
            ] * 10,
            'to_text': [
                '150', '200', '50', '10', '5', '150', '200', '50', '10', '5',
                '1'
            ] * 10,
            'to_categorical':
            [150, 200, 50, 10, 5, 150, 200, 50, 10, 5, 1] * 10,
            'stay_categorical': [
                '150', '200', '50', '10', '5', '150', '200', '50', '10', '5',
                '1'
            ] * 10
        }
        df1 = df2 = pd.DataFrame.from_dict(data)

        custom_column_types = {
            'to_numerical': ColumnType.numerical,
            'to_text': ColumnType.text,
            'to_categorical': ColumnType.categorical
        }

        store = Store(df1, df2, custom_column_types=custom_column_types)

        with self.subTest("Apply custom_column_types"):
            self.assertEqual(['to_categorical', 'stay_categorical'],
                             store.type_to_columns[ColumnType.categorical])
            self.assertEqual(['to_text'],
                             store.type_to_columns[ColumnType.text])
            self.assertEqual(['to_numerical'],
                             store.type_to_columns[ColumnType.numerical])

        with self.subTest(
                "Apply numerical conversion for custom_column_types to dataframes"
        ):
            self.assertTrue(is_numeric_dtype(store.df1['to_numerical']))
            self.assertTrue(store.df1['to_numerical'].equals(
                pd.Series([
                    150.0, 200.0, 50.0, 10.0, 5.0, 150.0, 200.0, 50.0, 10.0,
                    5.0, 1.0
                ] * 10)))

        with self.subTest(
                "Apply categorical conversion for custom_column_types to dataframes"
        ):
            self.assertTrue(is_string_dtype(store.df1['to_categorical']))
            self.assertTrue(store.df1['to_categorical'].equals(
                pd.Series([
                    '150', '200', '50', '10', '5', '150', '200', '50', '10',
                    '5', '1'
                ] * 10)))

        with self.subTest(
                "Apply textual conversion for custom_column_types to dataframes"
        ):
            self.assertTrue(is_string_dtype(store.df1['to_text']))
            self.assertTrue(store.df1['to_text'].equals(
                pd.Series([
                    '150', '200', '50', '10', '5', '150', '200', '50', '10',
                    '5', '1'
                ] * 10)))
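Store and ColumnType belong to the project under test; the conversions the test asserts can be sketched with plain pandas:

import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype

raw = pd.Series(["150", "200", "50"])
as_num = pd.to_numeric(raw)      # numeric dtype after parsing
print(is_numeric_dtype(as_num))  # True

as_text = pd.Series([150, 200, 50]).astype(str)
print(is_string_dtype(as_text))  # True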
Code example #9
File: concat.py Project: DusanMilunovic/pandas
def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetime64tz_dtype(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_extension_array_dtype(dtype):
            upcast_cls = 'object'
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslibs.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT
    else:  # pragma
        try:
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
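The final fallthrough leans on NumPy's common-type resolution; a quick look at what np.find_common_type returns for mixed numeric dtypes (note it is deprecated in recent NumPy releases):

import numpy as np

print(np.find_common_type([np.dtype("int64"), np.dtype("float32")], []))  # float64
print(np.find_common_type([np.dtype("int32"), np.dtype("uint32")], []))   # int64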
Code example #10
File: hashing.py Project: zycjss/pandas
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif (is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)):
        vals = vals.view('i8').astype('u8', copy=False)
    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes,
                              Index(categories),
                              ordered=False,
                              fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = _hash.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
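hash_array is also exposed publicly via pandas.util, and the hashes are deterministic for the same values and hash key (a quick sketch):

import numpy as np
from pandas.util import hash_array

vals = np.array([1, 2, 3], dtype="int64")
print(hash_array(vals))  # one uint64 hash per element
print((hash_array(vals) == hash_array(vals.copy())).all())  # True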
Code example #11
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        cy_op = WrappedCythonOp(kind=kind, how=how)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        cy_op.disallow_invalid_ops(dtype, is_numeric)

        if is_extension_array_dtype(dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        elif values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            res = self._cython_operation(
                kind=kind,
                values=values2d,
                how=how,
                axis=1,
                min_count=min_count,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype = dtype.kind + "8"
            values = values.astype(dtype, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        ngroups = self.ngroups
        comp_ids, _, _ = self.group_info

        assert axis == 1
        values = values.T

        out_shape = cy_op.get_output_shape(ngroups, values)
        func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
        out_dtype = cy_op.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if how in ["min", "max"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        elif kind == "transform":
            # TODO: min_count
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

            if self._filter_empty_groups and not counts.all():
                assert result.ndim != 2
                result = result[counts > 0]

        result = result.T

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            op_result = maybe_downcast_to_dtype(result, dtype)
        else:
            op_result = result

        return op_result
Code example #12
File: concat.py Project: ynorouzz/pandas
def _get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.dtype(np.float64), np.nan

    if _is_uniform_reindex(join_units):
        # FIXME: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = "category"
        elif is_datetime64tz_dtype(dtype):
            upcast_cls = "datetimetz"
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = "bool"
        elif issubclass(dtype.type, np.object_):
            upcast_cls = "object"
        elif is_datetime64_dtype(dtype):
            upcast_cls = "datetime"
        elif is_timedelta64_dtype(dtype):
            upcast_cls = "timedelta"
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_extension_array_dtype(dtype):
            upcast_cls = "object"
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = "float"

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # TODO: de-duplicate with maybe_promote?
    # create the result
    if "object" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif "category" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0], NaT
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]"), np.datetime64("NaT", "ns")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
    else:  # pragma
        try:
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.dtype(np.float64), np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
Code example #13
    def is_numeric_mixed_type(self) -> bool:
        return all(is_numeric_dtype(t) for t in self.get_dtypes())
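The same "all columns numeric" check written against a plain DataFrame instead of the internal get_dtypes() accessor (a sketch):

import pandas as pd
from pandas.api.types import is_numeric_dtype

def is_numeric_mixed_type(df: pd.DataFrame) -> bool:
    return all(is_numeric_dtype(t) for t in df.dtypes)

print(is_numeric_mixed_type(pd.DataFrame({"a": [1], "b": [2.0]})))  # True
print(is_numeric_mixed_type(pd.DataFrame({"a": [1], "b": ["x"]})))  # False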
Code example #14
File: asserters.py Project: selasley/pandas
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_series_type=True,
    check_less_precise=no_default,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="Series",
    *,
    check_index=True,
):
    """
    Check that left and right Series are equal.

    Parameters
    ----------
    left : Series
    right : Series
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
         Whether to check the Series class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude less
        than 1e-5, we compare the two numbers directly and check whether
        they are equivalent within the specified precision. Otherwise, we
        compare the **ratio** of the second number to the first number and
        check whether it is equivalent to 1 within the specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare numbers exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.

        .. versionadded:: 1.0.2
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.

        .. versionadded:: 1.2.0

    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Series'
        Specify object name being compared, internally used to show appropriate
        assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only values.

        .. versionadded:: 1.3.0

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b = pd.Series([1, 2, 3, 4])
    >>> tm.assert_series_equal(a, b)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, Series)

    if check_series_type:
        assert_class_equal(left, right, obj=obj)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{len(left)}, {left.index}"
        msg2 = f"{len(right)}, {right.index}"
        raise_assert_detail(obj, "Series length are different", msg1, msg2)

    if check_flags:
        assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"

    if check_index:
        # GH #38183
        assert_index_equal(
            left.index,
            right.index,
            exact=check_index_type,
            check_names=check_names,
            check_exact=check_exact,
            check_categorical=check_categorical,
            rtol=rtol,
            atol=atol,
            obj=f"{obj}.index",
        )

    if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
        lidx = left.index
        ridx = right.index
        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)

    if check_dtype:
        # We want to skip exact dtype checking when `check_categorical`
        # is False. We'll still raise if only one is a `Categorical`,
        # regardless of `check_categorical`
        if (isinstance(left.dtype, CategoricalDtype)
                and isinstance(right.dtype, CategoricalDtype)
                and not check_categorical):
            pass
        else:
            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")

    if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(
            right.dtype):
        left_values = left._values
        right_values = right._values
        # Only check exact if dtype is numeric
        if isinstance(left_values, ExtensionArray) and isinstance(
                right_values, ExtensionArray):
            assert_extension_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                index_values=np.asarray(left.index),
            )
        else:
            assert_numpy_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                obj=str(obj),
                index_values=np.asarray(left.index),
            )
    elif check_datetimelike_compat and (needs_i8_conversion(left.dtype)
                                        or needs_i8_conversion(right.dtype)):
        # we want to check only if we have compat dtypes
        # e.g. integer and M|m are NOT compat, but we can simply check
        # the values in that case

        # datetimelike may have different objects (e.g. datetime.datetime
        # vs Timestamp) but will compare equal
        if not Index(left._values).equals(Index(right._values)):
            msg = (f"[datetimelike_compat=True] {left._values} "
                   f"is not equal to {right._values}.")
            raise AssertionError(msg)
    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
        assert_interval_array_equal(left.array, right.array)
    elif isinstance(left.dtype, CategoricalDtype) or isinstance(
            right.dtype, CategoricalDtype):
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(
            right.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype_and_needs_i8_conversion(
            left.dtype,
            right.dtype) or is_extension_array_dtype_and_needs_i8_conversion(
                right.dtype, left.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
        # DatetimeArray or TimedeltaArray
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    else:
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("name", left, right, obj=obj)

    if check_categorical:
        if isinstance(left.dtype, CategoricalDtype) or isinstance(
                right.dtype, CategoricalDtype):
            assert_categorical_equal(
                left._values,
                right._values,
                obj=f"{obj} category",
                check_category_order=check_category_order,
            )
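When both dtypes are numeric and check_exact=True, the function takes the exact-comparison branch above; otherwise rtol/atol apply. A small sketch with the public testing API:

import pandas as pd
import pandas.testing as tm

a = pd.Series([0.1 + 0.2, 1.0])
b = pd.Series([0.3, 1.0])
tm.assert_series_equal(a, b)  # passes: difference is within rtol/atol
try:
    tm.assert_series_equal(a, b, check_exact=True)
except AssertionError:
    print("exact comparison fails: 0.1 + 0.2 != 0.3")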
Code example #15
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
    """
    Return dtype to use when concatenating specified units.

    Returns
    -------
    dtype
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.dtype(np.float64)

    if _is_uniform_reindex(join_units):
        # FIXME: integrate property
        empty_dtype = join_units[0].block.dtype
        return empty_dtype

    has_none_blocks = any(unit.block is None for unit in join_units)
    dtypes = [
        None if unit.block is None else unit.dtype for unit in join_units
    ]

    filtered_dtypes = [
        unit.dtype for unit in join_units
        if unit.block is not None and not unit.is_na
    ]
    if not len(filtered_dtypes):
        filtered_dtypes = [
            unit.dtype for unit in join_units if unit.block is not None
        ]
    dtype_alt = find_common_type(filtered_dtypes)

    upcast_classes = _get_upcast_classes(join_units, dtypes)

    if is_extension_array_dtype(dtype_alt):
        return dtype_alt
    elif dtype_alt == object:
        return dtype_alt

    # TODO: de-duplicate with maybe_promote?
    # create the result
    if "extension" in upcast_classes:
        return np.dtype("object")
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_)
        else:
            return np.dtype(np.bool_)
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0]
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]")
    else:
        try:
            common_dtype = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_)
        else:
            if is_float_dtype(common_dtype):
                return common_dtype
            elif is_numeric_dtype(common_dtype):
                if has_none_blocks:
                    return np.dtype(np.float64)
                else:
                    return common_dtype

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
Code example #16
    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
        # If we ever have BoolIndex or ComplexIndex, this may need to be tightened
        return is_numeric_dtype(dtype)
Code example #17
    def _cython_operation(
        self,
        kind: str,
        values,
        how: str,
        axis: int,
        min_count: int = -1,
        mask: np.ndarray | None = None,
        **kwargs,
    ) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        cy_op = WrappedCythonOp(kind=kind, how=how)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        cy_op.disallow_invalid_ops(dtype, is_numeric)

        comp_ids, _, _ = self.group_info
        ngroups = self.ngroups

        func_uses_mask = cy_op.uses_mask()
        if is_extension_array_dtype(dtype):
            if isinstance(values, BaseMaskedArray) and func_uses_mask:
                return cy_op._masked_ea_wrap_cython_operation(
                    values,
                    min_count=min_count,
                    ngroups=ngroups,
                    comp_ids=comp_ids,
                    **kwargs,
                )
            else:
                return cy_op._ea_wrap_cython_operation(
                    values,
                    min_count=min_count,
                    ngroups=ngroups,
                    comp_ids=comp_ids,
                    **kwargs,
                )

        return cy_op._cython_op_ndim_compat(
            values,
            min_count=min_count,
            ngroups=self.ngroups,
            comp_ids=comp_ids,
            mask=mask,
            **kwargs,
        )
Code example #18
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: np.ndarray | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype_str = dtype.kind + "8"
            values = values.astype(dtype_str, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        values = values.T

        if mask is not None:
            mask = mask.reshape(values.shape, order="C")

        out_shape = self.get_output_shape(ngroups, values)
        func, values = self.get_cython_func_and_vals(values, is_numeric)
        out_dtype = self.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            res_dtype = self.get_result_dtype(orig_values.dtype)
            # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type
            # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]"
            op_result = maybe_downcast_to_dtype(
                result, res_dtype  # type: ignore[arg-type]
            )
        else:
            op_result = result

        # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "ndarray")
        return op_result  # type: ignore[return-value]
Code example #19
    def plot_all(self,
                 row=None,
                 cols=None,
                 xlabel="",
                 ylabel="",
                 legend="",
                 cont_colors=False,
                 plot_filter_pos="",
                 plot_filter_neg="",
                 save=False):
        figr = plt.figure()
        if row is None:
            row = self.x
        if cols is None:
            cols = self.ys

        good_cols = []
        text_cols = []
        for col in cols:
            if plot_filter_neg and col.startswith(plot_filter_neg):
                continue
            if plot_filter_pos and not col.startswith(plot_filter_pos):
                continue
            if is_numeric_dtype(self.dataframe[col]):
                good_cols.append(col)
            else:
                try:
                    if "+-" in self.dataframe[col][0]:
                        good_cols.append(col)
                except:
                    pass

        for col in cols:
            if (col not in good_cols):
                try:
                    if isinstance(self.dataframe[col][0],
                                  list) and None in self.dataframe[col][0]:
                        continue
                    if is_numeric_dtype(self.dataframe[col]
                                        ) or "+-" in self.dataframe[col][0]:
                        continue
                    text_cols.append(col)
                except:
                    pass

        self.dataframe = self.dataframe.sort_values(by=text_cols,
                                                    ascending=True)

        sidex = sidey = int(np.sqrt(len(good_cols) + 1))
        while sidex * sidey < len(good_cols) + 1:
            # sidex += int((len(good_cols) - sidex ** 2) / sidex) + 1
            sidex += 1

        fig, axs = plt.subplots(sidex, sidey)
        # fig.subplots_adjust(bottom=0.2)
        fig.set_size_inches(25, 14)
        axes = axs.ravel()

        if cont_colors:
            # Use the dataframe length for the colour gradient; ``col`` here
            # would be a stale loop variable (undefined when ``cols`` is empty).
            plt.rcParams["axes.prop_cycle"] = plt.cycler(
                "color",
                plt.cm.viridis(np.linspace(0, 1, len(self.dataframe))))

        for i, col in enumerate(good_cols):
            ax = axes[i]
            leg = legend if legend else col
            col = self.dataframe[col]

            if is_numeric_dtype(col):
                self.normal_plot(col,
                                 row,
                                 ax,
                                 xlabel=xlabel,
                                 ylabel=ylabel,
                                 legend=leg)
            else:
                try:
                    self.conf_plot(col,
                                   row,
                                   ax,
                                   xlabel=xlabel,
                                   ylabel=ylabel,
                                   legend=leg)
                except:
                    pass

        while i < len(axes) - 1:
            i += 1
            axes[i].set_axis_off()

        leg = self.global_legend(text_cols, fig, axes[len(good_cols)])
        plt.gcf().tight_layout()

        if save:
            fig.savefig('./img/samplefigure',
                        bbox_extra_artists=[leg],
                        bbox_inches='tight')

        plt.show()
Code example #20
File: ops.py Project: zeneli/pandas
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis,
                          min_count: int = -1,
                          **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """

        assert kind in ["transform", "aggregate"]
        orig_values = values

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(f"{values.dtype} dtype not supported")
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations")
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations")

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     is_datetimelike, min_count)
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype):
            result = type(orig_values)(result.astype(np.int64),
                                       dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
Code example #21
File: cast.py Project: danielmoreira12/BAProject
def convert_dtypes(
    input_array,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> Dtype:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or PandasArray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtype()``.

    Returns
    -------
    dtype
        new dtype
    """

    if convert_string or convert_integer or convert_boolean:
        try:
            inferred_dtype = lib.infer_dtype(input_array)
        except ValueError:
            # Required to catch due to Period.  Can remove once GH 23553 is fixed
            inferred_dtype = input_array.dtype

        if not convert_string and is_string_dtype(inferred_dtype):
            inferred_dtype = input_array.dtype

        if convert_integer:
            target_int_dtype = "Int64"

            if isinstance(inferred_dtype,
                          str) and (inferred_dtype == "mixed-integer" or
                                    inferred_dtype == "mixed-integer-float"):
                inferred_dtype = target_int_dtype
            if is_integer_dtype(
                    input_array.dtype) and not is_extension_array_dtype(
                        input_array.dtype):
                from pandas.core.arrays.integer import _dtypes

                inferred_dtype = _dtypes.get(input_array.dtype.name,
                                             target_int_dtype)
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                    input_array.dtype):
                inferred_dtype = target_int_dtype

        else:
            if is_integer_dtype(inferred_dtype):
                inferred_dtype = input_array.dtype

        if convert_boolean:
            if is_bool_dtype(
                    input_array.dtype) and not is_extension_array_dtype(
                        input_array.dtype):
                inferred_dtype = "boolean"
        else:
            if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = input_array.dtype

    else:
        inferred_dtype = input_array.dtype

    return inferred_dtype
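This logic backs the public Series/DataFrame convert_dtypes() methods; a quick sketch of the observable behavior:

import pandas as pd

s = pd.Series([1, 2, None])      # float64 because of the missing value
print(s.convert_dtypes().dtype)  # Int64 (nullable integer, holds pd.NA)

df = pd.DataFrame({"x": ["a", "b"], "y": [True, False]})
print(df.convert_dtypes().dtypes)  # x: string, y: boolean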
Code example #22
def _get_empty_dtype_and_na(
        join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]:
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.dtype(np.float64), np.nan

    if _is_uniform_reindex(join_units):
        # FIXME: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = _get_upcast_classes(join_units, dtypes)

    # TODO: de-duplicate with maybe_promote?
    # create the result
    if "extension" in upcast_classes:
        if len(upcast_classes) == 1:
            cls = upcast_classes["extension"][0]
            return cls, cls.na_value
        else:
            return np.dtype("object"), np.nan
    elif "object" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif "category" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0], NaT
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]"), np.datetime64("NaT", "ns")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
    else:
        try:
            common_dtype = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(common_dtype):
                return common_dtype, common_dtype.type(np.nan)
            elif is_numeric_dtype(common_dtype):
                if has_none_blocks:
                    return np.dtype(np.float64), np.nan
                else:
                    return common_dtype, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
Code example #23
    def _cython_operation(
        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
    ) -> np.ndarray:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(dtype, how, is_numeric)

        if is_extension_array_dtype(dtype):
            return self._ea_wrap_cython_operation(
                kind, values, how, axis, min_count, **kwargs
            )

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(dtype):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(dtype):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups,) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func, min_count)
        elif kind == "transform":
            result = maybe_fill(np.empty(values.shape, dtype=out_dtype))

            # TODO: min_count
            result = self._transform(
                result, values, codes, func, is_datetimelike, **kwargs
            )

        if is_integer_dtype(result.dtype) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if swapped:
            result = result.swapaxes(0, axis)

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            # error: Incompatible types in assignment (expression has type
            # "Union[ExtensionArray, ndarray]", variable has type "ndarray")
            result = maybe_downcast_to_dtype(result, dtype)  # type: ignore[assignment]

        return result
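At the public level, the datetimelike branch above means reductions such as min/max work on datetimes (viewed as int64, then restored), while disallowed ops raise; a sketch, noting that the exact error text varies across versions:

import pandas as pd

df = pd.DataFrame({
    "g": ["a", "a", "b"],
    "ts": pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-02"]),
})

print(df.groupby("g")["ts"].min())   # fine: values are viewed as int64, then cast back

try:
    df.groupby("g")["ts"].sum()      # disallowed: adding datetimes is not meaningful
except TypeError as exc:
    print(exc)
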
Code example #24
    def _can_union_without_object_cast(self, other) -> bool:
        # See GH#26778, further casting may occur in NumericIndex._union
        return is_numeric_dtype(other.dtype)
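A short illustration of the decision this predicate encodes, assuming a pandas version where numeric Index unions avoid the object fallback:

import pandas as pd

ints = pd.Index([1, 2, 3])
floats = pd.Index([2.5, 3.5])

print(ints.union(floats).dtype)                # float64: numeric dtypes union without an object cast
print(ints.union(pd.Index(["a", "b"])).dtype)  # object: mixed numeric/str falls back to object
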
Code example #25
File: concat.py  Project: sweb/pandas
def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetime64tz_dtype(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslibs.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT
    else:  # pragma
        try:
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
Code example #26
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        assert kind in ['transform', 'aggregate']

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values):
            raise NotImplementedError(
                "categoricals are not supported in cython ops ATM")
        elif is_datetime64_any_dtype(values):
            if how in ['add', 'prod', 'cumsum', 'cumprod']:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ['prod', 'cumprod']:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups, ) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int64_or_float64(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == 'rank':
            out_dtype = 'float'
        else:
            if is_numeric:
                out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
            else:
                out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, labels, func,
                                     is_numeric, is_datetimelike, min_count)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype('float64')
                result[mask] = np.nan

        if (kind == 'aggregate' and self._filter_empty_groups
                and not counts.all()):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(result,
                                                 (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        ensure_object(result), (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
Code example #27
File: ops.py  Project: huaxz1986/pandas
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: np.ndarray | None,
        result_mask: np.ndarray | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # GH#43329 If the dtype is explicitly of type uint64 the type is not
            # changed to prevent overflow.
            if dtype != np.uint64:
                values = values.astype(np.int64, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        values = values.T
        if mask is not None:
            mask = mask.T
            if result_mask is not None:
                result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        func, values = self.get_cython_func_and_vals(values, is_numeric)
        out_dtype = self.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max", "mean"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["add"]:
                # We support datetimelike
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "ndarray")
        return op_result  # type: ignore[return-value]
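The GH#43329 guard above keeps uint64 data out of int64 so large values survive a groupby reduction; a sketch, assuming a pandas version that includes that fix:

import numpy as np
import pandas as pd

big = np.uint64(2**63 + 10)                      # does not fit in int64
df = pd.DataFrame({"g": [0, 0], "v": np.array([big, 1], dtype="uint64")})

res = df.groupby("g")["v"].max()
print(res.dtype, res.iloc[0])                    # uint64, value preserved exactly
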
Code example #28
File: pandas.py  Project: YoannT/nlstruct
def normalize_vocabularies(dfs,
                           vocabularies=None,
                           train_vocabularies=True,
                           unk=None,
                           verbose=0):
    """
    Categorize the columns of the dataframes so that they share the same
    categories if they share the same columns
    If a column's name ends with '_id', it is not categorized, since it is not something we want to train on

    Parameters
    ----------
    dfs: list of pd.DataFrame
        DataFrame whose columns will be categorized
    vocabularies: dict or None
        Existing vocabulary to use if any
    train_vocabularies: bool or dict of (str, bool)
        Which vocabularies should be extended/created?
    unk: dict of (str, any)
        Filler value to use for an unknown entry when the corresponding
        vocabulary cannot be trained.
    verbose: int

    Returns
    -------
    list of pd.DataFrame, dict
    """
    # Define label vocabulary
    if unk is None:
        unk = {}
    if vocabularies is None:
        vocabularies = {}
    voc_order = list(vocabularies.keys())

    if train_vocabularies is False:
        train_vocabularies = defaultdict(lambda: False)
    else:
        train_vocabularies_ = defaultdict(lambda: True)
        if isinstance(train_vocabularies, dict):
            train_vocabularies_.update(train_vocabularies)
        train_vocabularies = train_vocabularies_
        del train_vocabularies_

    for col_name in vocabularies:
        if col_name not in train_vocabularies:
            train_vocabularies[col_name] = False

    # Touching the defaultdict materializes each trainable column's key with
    # its default value, so the loops below see every candidate column.
    for df in dfs:
        for col_name in df:
            if not col_name.endswith('_id') and not is_numeric_dtype(
                    df[col_name].dtype):
                train_vocabularies[col_name] = train_vocabularies[col_name]
    for col_name, will_train in train_vocabularies.items():
        if will_train and verbose:
            print(f"Will train vocabulary for {col_name}")
    for df in dfs:
        for col_name in df:
            if (hasattr(df[col_name], 'cat')
                    and col_name not in vocabularies
                    and not col_name.endswith('_id')):
                if verbose:
                    print(
                        f"Discovered existing vocabulary ({len(df[col_name].cat.categories)} entities) for {col_name}"
                    )
                vocabularies[col_name] = list(df[col_name].dtype.categories)
    for voc_name, train_voc in train_vocabularies.items():
        if train_voc:
            voc = list(vocabularies.get(voc_name, []))
            if voc_name in unk and unk[voc_name] not in voc:
                voc.append(unk[voc_name])
            if hasattr(voc, 'categories'):
                voc = list(voc.categories)
            for df in dfs:
                if voc_name in df:
                    voc.extend(df[voc_name].astype("category").cat.categories)
            voc = pd.factorize(voc)[1]  # deduplicate, keeping first-seen order
            dtype = pd.CategoricalDtype(voc)
            for df in dfs:
                if voc_name in df:
                    df[voc_name] = df[voc_name].astype(dtype)
                    vocabularies[voc_name] = voc
                    if voc_name in unk:
                        df[voc_name].fillna(unk[voc_name], inplace=True)
        else:
            voc = vocabularies.get(voc_name)
            if not hasattr(voc, 'categories'):
                voc = pd.CategoricalDtype(voc)
            for df in dfs:
                if voc_name in df:
                    df[voc_name] = df[voc_name].astype(voc)
                    if verbose:
                        unk_msg = f"unk {unk[voc_name]}" if voc_name in unk else "no unk"
                        print(
                            f"Normalized {voc_name}, with given vocabulary and {unk_msg}"
                        )
                    if voc_name in unk:
                        df[voc_name].fillna(unk[voc_name], inplace=True)

    # Reorder vocabularies to keep the same order as the vocabularies passed as parameters
    vocabularies = dict(
        (*((c, vocabularies[c]) for c in voc_order if c in vocabularies),
         *((c, vocabularies[c]) for c in vocabularies if c not in voc_order)))

    # Reorder dataframes according to vocabulary order
    dfs = [
        df[[
            *(c for c in vocabularies if c in df.columns),
            *(c for c in df.columns if c not in vocabularies)
        ]] for df in dfs
    ]
    return dfs, vocabularies
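A hypothetical usage sketch for the function above (the data and column names are illustrative, and the function is assumed importable from the module shown):

import pandas as pd

train = pd.DataFrame({"doc_id": ["d0", "d1"], "label": ["PER", "LOC"]})
test = pd.DataFrame({"doc_id": ["d2"], "label": ["ORG"]})

# '_id' columns are left untouched; 'label' gets one vocabulary shared by both frames
(train, test), vocabularies = normalize_vocabularies([train, test], verbose=1)
print(list(vocabularies["label"]))   # categories pooled from both frames
print(train["label"].dtype)          # CategoricalDtype over the shared categories
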
Code example #29
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        assert kind in ["transform", "aggregate"]

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(
                "{} are not supported in cython ops".format(values.dtype))
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups, ) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = "object"

        labels, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result,
                counts,
                values,
                labels,
                func,
                is_numeric,
                is_datetimelike,
                min_count,
            )
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if (kind == "aggregate" and self._filter_empty_groups
                and not counts.all()):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
Code example #30
def _adjust_to_origin(arg, origin, unit):
    """
    Helper function for to_datetime.
    Adjust input argument to the specified origin

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be adjusted
    origin : 'julian' or Timestamp
        origin offset for the arg
    unit : string
        passed unit from to_datetime, must be 'D'

    Returns
    -------
    ndarray or scalar of adjusted date(s)
    """
    if origin == "julian":
        original = arg
        j0 = Timestamp(0).to_julian_date()
        if unit != "D":
            raise ValueError("unit must be 'D' for origin='julian'")
        try:
            arg = arg - j0
        except TypeError:
            raise ValueError(
                "incompatible 'arg' type for given 'origin'='julian'")

        # preemptively check this for a nice range
        j_max = Timestamp.max.to_julian_date() - j0
        j_min = Timestamp.min.to_julian_date() - j0
        if np.any(arg > j_max) or np.any(arg < j_min):
            raise tslibs.OutOfBoundsDatetime(
                f"{original} is Out of Bounds for origin='julian'")
    else:
        # arg must be numeric
        if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg)))
                or is_numeric_dtype(np.asarray(arg))):
            raise ValueError(
                f"'{arg}' is not compatible with origin='{origin}'; "
                "it must be numeric with a unit specified")

        # we are going to offset back to unix / epoch time
        try:
            offset = Timestamp(origin)
        except tslibs.OutOfBoundsDatetime:
            raise tslibs.OutOfBoundsDatetime(
                f"origin {origin} is Out of Bounds")
        except ValueError:
            raise ValueError(
                f"origin {origin} cannot be converted to a Timestamp")

        if offset.tz is not None:
            raise ValueError(f"origin offset {offset} must be tz-naive")
        offset -= Timestamp(0)

        # convert the offset to the unit of the arg
        # this should be lossless in terms of precision
        offset = offset // tslibs.Timedelta(1, unit=unit)

        # scalars & ndarray-like can handle the addition
        if is_list_like(arg) and not isinstance(
                arg, (ABCSeries, ABCIndexClass, np.ndarray)):
            arg = np.asarray(arg)
        arg = arg + offset
    return arg
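The two branches above surface directly through pd.to_datetime's origin parameter; for example:

import pandas as pd

# Julian day numbers start at noon; day 2440588 is noon on 1970-01-01
print(pd.to_datetime(2440588, unit="D", origin="julian"))
# Timestamp('1970-01-01 12:00:00')

# a Timestamp-convertible origin offsets numeric values from that date
print(pd.to_datetime([1, 2], unit="D", origin=pd.Timestamp("1960-01-01")))
# DatetimeIndex(['1960-01-02', '1960-01-03'], ...)
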
Code example #31
File: numeric.py  Project: zahirakkouche/pandas
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")

    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndex):
        is_index = True
        if needs_i8_conversion(arg.dtype):
            values = arg.asi8
        else:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalars = True
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    values_dtype = getattr(values, "dtype", None)
    if is_numeric_dtype(values_dtype):
        pass
    elif is_datetime_or_timedelta_dtype(values_dtype):
        values = values.astype(np.int64)
    else:
        values = ensure_object(values)
        coerce_numeric = errors not in ("ignore", "raise")
        try:
            values = lib.maybe_convert_numeric(values,
                                               set(),
                                               coerce_numeric=coerce_numeric)
        except (ValueError, TypeError):
            if errors == "raise":
                raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values.dtype):
        typecodes = None

        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values)
                                         or np.min(values) >= 0):
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            typecodes = np.typecodes["Float"]

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                dtype = np.dtype(dtype)
                if dtype.itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_numeric(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return arg._constructor(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
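Two of the downcast guards above are easy to exercise from the public API; a brief sketch:

import pandas as pd

s = pd.Series(["7", "80", "200"])
print(pd.to_numeric(s, downcast="unsigned").dtype)   # uint8: every value fits the smallest unsigned type

# negative values fail the np.min(values) >= 0 check, so no unsigned downcast occurs
print(pd.to_numeric(pd.Series([-1, 2]), downcast="unsigned").dtype)  # int64
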
Code example #32
File: hashing.py  Project: ankostis/pandas
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif (is_datetime64_dtype(vals) or
          is_timedelta64_dtype(vals)):
        vals = vals.view('i8').astype('u8', copy=False)
    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(vals.astype(str).astype(object),
                                             hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
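A usage sketch; hash_array is importable from pandas.util in modern releases, though its location has moved over time:

import numpy as np
from pandas.util import hash_array

vals = np.array([1, 2, 3], dtype="int64")
print(hash_array(vals))                    # deterministic uint64 hashes, one per element

# a different dtype gives different bit patterns, hence different hashes
print(hash_array(vals.astype("float64")))
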
Code example #33
def to_datetime(arg,
                errors='raise',
                dayfirst=False,
                yearfirst=False,
                utc=None,
                box=True,
                format=None,
                exact=True,
                unit=None,
                infer_datetime_format=False,
                origin='unix'):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series

        .. versionadded:: 0.18.1

           or DataFrame/dict-like

    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input
    dayfirst : boolean, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    yearfirst : boolean, default False
        Specify a date parse order if `arg` is str or its list-likes.

        - If True parses dates with the year first, eg 10/11/12 is parsed as
          2010-11-12.
        - If both dayfirst and yearfirst are True, yearfirst takes precedence
          (same as dateutil).

        Warning: yearfirst=True is not strict, but will prefer to parse
        with year first (this is a known bug, based on dateutil behavior).

        .. versionadded:: 0.16.1

    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well).
    box : boolean, default True

        - If True returns a DatetimeIndex
        - If False returns ndarray of values.
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
    exact : boolean, True by default

        - If True, require an exact format match.
        - If False, allow the format to match anywhere in the target string.

    unit : string, default 'ns'
        Unit of the arg (D, s, ms, us, ns), where arg is an integer or
        float number. The value is interpreted relative to the origin.
        For example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : boolean, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.
    origin : scalar, default is 'unix'
        Define the reference date. The numeric values would be parsed as number
        of units (defined by `unit`) since this reference date.

        - If 'unix' (or POSIX) time; origin is set to 1970-01-01.
        - If 'julian', unit must be 'D', and origin is set to beginning of
          Julian Calendar. Julian day number 0 is assigned to the day starting
          at noon on January 1, 4713 BC.
        - If Timestamp convertible, origin is set to Timestamp identified by
          origin.

        .. versionadded:: 0.20.0

    Returns
    -------
    ret : datetime if parsing succeeded.
        Return type depends on input:

        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

        In case when it is not possible to return designated types (e.g. when
        any element of input is before Timestamp.min or after Timestamp.max)
        return will have datetime.datetime type (or correspoding array/Series).

    Examples
    --------

    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns'] or plurals of the same.

    >>> df = pd.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> pd.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]

    If a date does not meet the `timestamp limitations
    <http://pandas.pydata.org/pandas-docs/stable/timeseries.html
    #timeseries-timestamp-limits>`_, passing errors='ignore'
    will return the original input instead of raising any exception.

    Passing errors='coerce' will force an out-of-bounds date to NaT,
    in addition to forcing non-dates (or non-parseable dates) to NaT.

    >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
    datetime.datetime(1300, 1, 1, 0, 0)
    >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
    NaT

    Passing infer_datetime_format=True can often speed up parsing if the
    strings are not exactly in ISO 8601 format but follow a regular pattern.

    >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000)

    >>> s.head()
    0    3/11/2000
    1    3/12/2000
    2    3/13/2000
    3    3/11/2000
    4    3/12/2000
    dtype: object

    >>> %timeit pd.to_datetime(s,infer_datetime_format=True)
    100 loops, best of 3: 10.4 ms per loop

    >>> %timeit pd.to_datetime(s,infer_datetime_format=False)
    1 loop, best of 3: 471 ms per loop

    Using a unix epoch time

    >>> pd.to_datetime(1490195805, unit='s')
    Timestamp('2017-03-22 15:16:45')
    >>> pd.to_datetime(1490195805433502912, unit='ns')
    Timestamp('2017-03-22 15:16:45.433502912')

    .. warning:: For float arg, precision rounding might happen. To prevent
        unexpected behavior use a fixed-width exact type.

    Using a non-unix epoch origin

    >>> pd.to_datetime([1, 2, 3], unit='D',
    ...                origin=pd.Timestamp('1960-01-01'))
    0    1960-01-02
    1    1960-01-03
    2    1960-01-04

    """
    from pandas.core.indexes.datetimes import DatetimeIndex

    tz = 'utc' if utc else None

    def _convert_listlike(arg, box, format, name=None, tz=tz):

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # these are shortcutable
        if is_datetime64tz_dtype(arg):
            if not isinstance(arg, DatetimeIndex):
                return DatetimeIndex(arg, tz=tz, name=name)
            if utc:
                arg = arg.tz_convert(None).tz_localize('UTC')
            return arg

        elif is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz=tz, name=name)
                except ValueError:
                    pass

            return arg

        elif unit is not None:
            if format is not None:
                raise ValueError("cannot specify both format and unit")
            arg = getattr(arg, 'values', arg)
            result = tslib.array_with_unit_to_datetime(arg,
                                                       unit,
                                                       errors=errors)
            if box:
                if errors == 'ignore':
                    from pandas import Index
                    return Index(result)

                return DatetimeIndex(result, tz=tz, name=name)
            return result
        elif getattr(arg, 'ndim', 1) > 1:
            raise TypeError('arg must be a string, datetime, list, tuple, '
                            '1-d array, or Series')

        arg = _ensure_object(arg)
        require_iso8601 = False

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes the process slower in this
            # special case
            format_is_iso8601 = _format_is_iso(format)
            if format_is_iso8601:
                require_iso8601 = not infer_datetime_format
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, errors=errors)
                    except (ValueError, OverflowError, TypeError):
                        raise ValueError("cannot convert the input to "
                                         "'%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(arg,
                                                      format,
                                                      exact=exact,
                                                      errors=errors)
                    except tslib.OutOfBoundsDatetime:
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # if format was inferred, try falling back
                        # to array_to_datetime - terminate here
                        # for specified formats
                        if not infer_datetime_format:
                            if errors == 'raise':
                                raise
                            result = arg

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(
                    arg,
                    errors=errors,
                    utc=utc,
                    dayfirst=dayfirst,
                    yearfirst=yearfirst,
                    require_iso8601=require_iso8601)

            if is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz=tz, name=name)
            return result

        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return None

    # handle origin
    if origin == 'julian':

        original = arg
        j0 = tslib.Timestamp(0).to_julian_date()
        if unit != 'D':
            raise ValueError("unit must be 'D' for origin='julian'")
        try:
            arg = arg - j0
        except TypeError:
            raise ValueError("incompatible 'arg' type for given "
                             "'origin'='julian'")

        # preemptively check this for a nice range
        j_max = tslib.Timestamp.max.to_julian_date() - j0
        j_min = tslib.Timestamp.min.to_julian_date() - j0
        if np.any(arg > j_max) or np.any(arg < j_min):
            raise tslib.OutOfBoundsDatetime(
                "{original} is Out of Bounds for "
                "origin='julian'".format(original=original))

    elif origin not in ['unix', 'julian']:

        # arg must be a numeric
        original = arg
        if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg)))
                or is_numeric_dtype(np.asarray(arg))):
            raise ValueError(
                "'{arg}' is not compatible with origin='{origin}'; "
                "it must be numeric with a unit specified ".format(
                    arg=arg, origin=origin))

        # we are going to offset back to unix / epoch time
        try:
            offset = tslib.Timestamp(origin) - tslib.Timestamp(0)
        except tslib.OutOfBoundsDatetime:
            raise tslib.OutOfBoundsDatetime(
                "origin {} is Out of Bounds".format(origin))
        except ValueError:
            raise ValueError("origin {} cannot be converted "
                             "to a Timestamp".format(origin))

        # convert the offset to the unit of the arg
        # this should be lossless in terms of precision
        offset = offset // tslib.Timedelta(1, unit=unit)

        # scalars & ndarray-like can handle the addition
        if is_list_like(arg) and not isinstance(
                arg, (ABCSeries, ABCIndexClass, np.ndarray)):
            arg = np.asarray(arg)
        arg = arg + offset

    if isinstance(arg, tslib.Timestamp):
        result = arg
    elif isinstance(arg, ABCSeries):
        from pandas import Series
        values = _convert_listlike(arg._values, False, format)
        result = Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (ABCDataFrame, MutableMapping)):
        result = _assemble_from_unit_mappings(arg, errors=errors)
    elif isinstance(arg, ABCIndexClass):
        result = _convert_listlike(arg, box, format, name=arg.name)
    elif is_list_like(arg):
        result = _convert_listlike(arg, box, format)
    else:
        result = _convert_listlike(np.array([arg]), box, format)[0]

    return result
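Beyond the docstring examples, the DataFrame path (_assemble_from_unit_mappings) also accepts finer-grained unit columns; a small sketch:

import pandas as pd

df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3],
                   "day": [4, 5], "hour": [10, 11]})
print(pd.to_datetime(df))
# 0   2015-02-04 10:00:00
# 1   2016-03-05 11:00:00
# dtype: datetime64[ns]
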
Code example #34
File: numeric.py  Project: christlc/pandas
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    See Also
    --------
    pandas.DataFrame.astype : Cast argument to a specified dtype.
    pandas.to_datetime : Convert argument to datetime.
    pandas.to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndexClass):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalars = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = ensure_object(values)
            coerce_numeric = errors not in ('ignore', 'raise')
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and np.min(values) >= 0:
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
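The scalar shortcuts near the top of the function can be exercised directly:

from decimal import Decimal
import pandas as pd

print(pd.to_numeric(Decimal("1.5")))   # 1.5: Decimal scalars are converted to float immediately
print(pd.to_numeric(3))                # 3: numbers pass through unchanged
print(pd.to_numeric("42"))             # 42: string scalars go through the array parsing path
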
Code example #35
File: datetimes.py  Project: changhiskhan/pandas
def _adjust_to_origin(arg, origin, unit):
    """
    Helper function for to_datetime.
    Adjust input argument to the specified origin

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be adjusted
    origin : 'julian' or Timestamp
        origin offset for the arg
    unit : string
        passed unit from to_datetime, must be 'D'

    Returns
    -------
    ndarray or scalar of adjusted date(s)
    """
    if origin == 'julian':
        original = arg
        j0 = Timestamp(0).to_julian_date()
        if unit != 'D':
            raise ValueError("unit must be 'D' for origin='julian'")
        try:
            arg = arg - j0
        except TypeError:
            raise ValueError("incompatible 'arg' type for given "
                             "'origin'='julian'")

        # preemptively check this for a nice range
        j_max = Timestamp.max.to_julian_date() - j0
        j_min = Timestamp.min.to_julian_date() - j0
        if np.any(arg > j_max) or np.any(arg < j_min):
            raise tslibs.OutOfBoundsDatetime(
                "{original} is Out of Bounds for "
                "origin='julian'".format(original=original))
    else:
        # arg must be numeric
        if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or
                is_numeric_dtype(np.asarray(arg))):
            raise ValueError(
                "'{arg}' is not compatible with origin='{origin}'; "
                "it must be numeric with a unit specified ".format(
                    arg=arg,
                    origin=origin))

        # we are going to offset back to unix / epoch time
        try:
            offset = Timestamp(origin)
        except tslibs.OutOfBoundsDatetime:
            raise tslibs.OutOfBoundsDatetime(
                "origin {origin} is Out of Bounds".format(origin=origin))
        except ValueError:
            raise ValueError("origin {origin} cannot be converted "
                             "to a Timestamp".format(origin=origin))

        if offset.tz is not None:
            raise ValueError(
                "origin offset {} must be tz-naive".format(offset))
        offset -= Timestamp(0)

        # convert the offset to the unit of the arg
        # this should be lossless in terms of precision
        offset = offset // tslibs.Timedelta(1, unit=unit)

        # scalars & ndarray-like can handle the addition
        if is_list_like(arg) and not isinstance(
                arg, (ABCSeries, ABCIndexClass, np.ndarray)):
            arg = np.asarray(arg)
        arg = arg + offset
    return arg
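For a non-julian origin, the arithmetic above reduces to converting the origin Timestamp into whole units since the unix epoch and adding that offset to the numeric input. A standalone sketch of the same computation:

import numpy as np
import pandas as pd

origin = pd.Timestamp('1960-01-01')
offset = (origin - pd.Timestamp(0)) // pd.Timedelta(1, unit='D')
# offset == -3653: days from the unix epoch back to 1960-01-01

adjusted = np.asarray([1, 2, 3]) + offset
pd.to_datetime(adjusted, unit='D')
# DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], ...)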
Code example #36
    def assertPandasEqual(self, left, right, check_exact=True):
        if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame):
            try:
                if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
                    kwargs = dict(check_freq=False)
                else:
                    kwargs = dict()

                if LooseVersion(pd.__version__) < LooseVersion("1.1.1"):
                    # Due to https://github.com/pandas-dev/pandas/issues/35446
                    check_exact = (check_exact and all([
                        is_numeric_dtype(dtype) for dtype in left.dtypes
                    ]) and all(
                        [is_numeric_dtype(dtype) for dtype in right.dtypes]))

                assert_frame_equal(
                    left,
                    right,
                    check_index_type=("equiv"
                                      if len(left.index) > 0 else False),
                    check_column_type=("equiv"
                                       if len(left.columns) > 0 else False),
                    check_exact=check_exact,
                    **kwargs,
                )
            except AssertionError as e:
                msg = (str(e) + "\n\nLeft:\n%s\n%s" % (left, left.dtypes) +
                       "\n\nRight:\n%s\n%s" % (right, right.dtypes))
                raise AssertionError(msg) from e
        elif isinstance(left, pd.Series) and isinstance(right, pd.Series):
            try:
                if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
                    kwargs = dict(check_freq=False)
                else:
                    kwargs = dict()
                if LooseVersion(pd.__version__) < LooseVersion("1.1.1"):
                    # Due to https://github.com/pandas-dev/pandas/issues/35446
                    check_exact = (check_exact and is_numeric_dtype(left.dtype)
                                   and is_numeric_dtype(right.dtype))
                assert_series_equal(
                    left,
                    right,
                    check_index_type=("equiv"
                                      if len(left.index) > 0 else False),
                    check_exact=check_exact,
                    **kwargs,
                )
            except AssertionError as e:
                msg = (str(e) + "\n\nLeft:\n%s\n%s" % (left, left.dtype) +
                       "\n\nRight:\n%s\n%s" % (right, right.dtype))
                raise AssertionError(msg) from e
        elif isinstance(left, pd.Index) and isinstance(right, pd.Index):
            try:
                if LooseVersion(pd.__version__) < LooseVersion("1.1.1"):
                    # Due to https://github.com/pandas-dev/pandas/issues/35446
                    check_exact = (check_exact and is_numeric_dtype(left.dtype)
                                   and is_numeric_dtype(right.dtype))
                assert_index_equal(left, right, check_exact=check_exact)
            except AssertionError as e:
                msg = (str(e) + "\n\nLeft:\n%s\n%s" % (left, left.dtype) +
                       "\n\nRight:\n%s\n%s" % (right, right.dtype))
                raise AssertionError(msg) from e
        else:
            raise ValueError("Unexpected values: (%s, %s)" % (left, right))
Code example #37
def align_text(series: pd.Series) -> List[str]:
    """Right-align numeric data; left-align text data."""
    # One CSS declaration per cell; the alignment depends only on the
    # column dtype, so the same string is repeated for every element.
    return [
        "text-align: right;" if is_numeric_dtype(series) else "text-align: left;"
        for _ in series
    ]
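align_text matches the contract of Styler.apply with the default axis=0: it receives one column as a Series and must return one CSS declaration per cell. A small usage sketch, assuming align_text and is_numeric_dtype are in scope:

import pandas as pd

df = pd.DataFrame({'name': ['a', 'b'], 'value': [1, 2]})
styled = df.style.apply(align_text)   # right-aligns 'value', left-aligns 'name'
html = styled.render()                # styled.to_html() on newer pandas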
Code example #38
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif isscalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = _ensure_object(values)
            coerce_numeric = errors not in ('ignore', 'raise')
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and np.min(values) >= 0:
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
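Besides Series input, the is_scalar and is_index branches above mean the function round-trips scalars and Index objects as well, for example:

import pandas as pd

pd.to_numeric('1.5')                  # scalar in, scalar out: 1.5
pd.to_numeric(pd.Index(['1', '2']))   # Index in, numeric Index out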
Code example #39
File: all.py Project: zhenwenzhang/interpret
def autogen_schema(X,
                   ordinal_max_items=2,
                   feature_names=None,
                   feature_types=None):
    """ Generates data schema for a given dataset as JSON representable.

    Args:
        X: Dataframe/ndarray to build schema from.
        ordinal_max_items: If a numeric column's cardinality
            is at most this integer,
            consider it as ordinal instead of continuous.
        feature_names: Feature names
        feature_types: Feature types

    Returns:
        A dictionary - schema that encapsulates column information,
        such as type and domain.
    """
    schema = OrderedDict()
    col_number = 0
    if isinstance(X, np.ndarray):
        log.warning(
            "Passing a numpy array to schema autogen when it should be dataframe."
        )
        if feature_names is None:
            X = pd.DataFrame(X,
                             columns=[
                                 'col_' + str(i) for i in range(X.shape[1])
                             ]).infer_objects()
        else:
            X = pd.DataFrame(X, columns=feature_names).infer_objects()
    if isinstance(X, NDFrame):
        for name, col_dtype in zip(X.dtypes.index, X.dtypes):
            schema[name] = {}
            if is_numeric_dtype(col_dtype):
                # schema[name]['type'] = 'continuous'
                # TODO: Fix this once we know it works.
                if len(set(X[name])) > ordinal_max_items:
                    schema[name]['type'] = 'continuous'
                else:
                    # TODO: Work with ordinal later.
                    schema[name]['type'] = 'categorical'
                    # schema[name]['type'] = 'ordinal'
                    # schema[name]['order'] = list(set(X[name]))
            elif is_string_dtype(col_dtype):
                schema[name]['type'] = 'categorical'
            else:
                warnings.warn("Unknown column: " + name, RuntimeWarning)
                schema[name]['type'] = 'unknown'
            schema[name]['column_number'] = col_number
            col_number += 1

        # Override if feature_types is passed as arg.
        if feature_types is not None:
            for idx, name in enumerate(X.dtypes.index):
                schema[name]['type'] = feature_types[idx]
    else:
        raise TypeError(
            "GA2M only supports numpy arrays or pandas dataframes.")

    return schema
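A quick usage sketch, assuming autogen_schema and its imports are in scope: a numeric column whose cardinality exceeds ordinal_max_items is typed continuous, while a string column is typed categorical.

import pandas as pd

df = pd.DataFrame({'age': [23, 35, 47], 'gender': ['f', 'm', 'f']})
autogen_schema(df)
# OrderedDict([('age', {'type': 'continuous', 'column_number': 0}),
#              ('gender', {'type': 'categorical', 'column_number': 1})])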
Code example #40
File: ops.py Project: bashtage/pandas
    def _cython_operation(self, kind, values, how, axis, min_count=-1,
                          **kwargs):
        assert kind in ['transform', 'aggregate']

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values):
            raise NotImplementedError(
                "categoricals are not support in cython ops ATM")
        elif is_datetime64_any_dtype(values):
            if how in ['add', 'prod', 'cumsum', 'cumprod']:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ['prod', 'cumprod']:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups,) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int64_or_float64(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(
                kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(
                    kind, how, values, is_numeric)
            else:
                raise

        if how == 'rank':
            out_dtype = 'float'
        else:
            if is_numeric:
                out_dtype = '{kind}{itemsize}'.format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result, counts, values, labels, func, is_numeric,
                is_datetimelike, min_count)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(
                result, values, labels, func, is_numeric, is_datetimelike,
                **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype('float64')
                result[mask] = np.nan

        if (kind == 'aggregate' and
                self._filter_empty_groups and not counts.all()):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        ensure_object(result),
                        (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
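The pre-dispatch coercions above can be sketched with plain NumPy: bools become float64, and integer data containing iNaT (pandas' int64 sentinel for missing values) is promoted to float64 so NaN can be used instead.

import numpy as np

iNaT = np.iinfo(np.int64).min        # pandas' missing-value sentinel

np.array([True, False]).astype('float64')      # array([1., 0.])

ints = np.array([1, iNaT], dtype='int64')
if (ints == iNaT).any():
    out = ints.astype('float64')
    out[ints == iNaT] = np.nan       # array([ 1., nan])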
Code example #41
    def _cython_operation(
        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
    ):
        assert kind in ["transform", "aggregate"]
        orig_values = values

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(
                "{dtype} dtype not supported".format(dtype=values.dtype)
            )
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {how} operations".format(how=how)
                )
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {how} operations".format(how=how)
                )

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups,) + values.shape[1:]

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                try:
                    values = ensure_float64(values)
                except TypeError:
                    if lib.infer_dtype(values, skipna=False) == "complex":
                        values = values.astype(complex)
                    else:
                        raise
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize
                )
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(
                np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
            )
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result, counts, values, codes, func, is_datetimelike, min_count
            )
        elif kind == "transform":
            result = _maybe_fill(
                np.empty_like(values, dtype=out_dtype), fill_value=np.nan
            )

            # TODO: min_count
            result = self._transform(
                result, values, codes, func, is_datetimelike, **kwargs
            )

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            names = self._name_functions[how]()  # type: Optional[List[str]]
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype):
            result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
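Datetimelike values take the numeric path via an int64 view and are cast back at the end; the round-trip itself is lossless, as this standalone sketch shows:

import numpy as np

ts = np.array(['2020-01-01', '2020-01-02'], dtype='M8[ns]')
as_i8 = ts.view('int64')      # nanoseconds since the unix epoch
back = as_i8.view('M8[ns]')   # lossless view back to datetime64
assert (back == ts).all()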
Code example #42
File: datetimes.py Project: Xbar/pandas
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                utc=None, box=True, format=None, exact=True,
                unit=None, infer_datetime_format=False, origin='unix',
                cache=False):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series

        .. versionadded:: 0.18.1

           or DataFrame/dict-like

    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input
    dayfirst : boolean, default False
        Specify a date parse order if `arg` is str or a list-like of str.
        If True, parses dates with the day first, e.g. 10/11/12 is parsed
        as 2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    yearfirst : boolean, default False
        Specify a date parse order if `arg` is str or a list-like of str.

        - If True parses dates with the year first, e.g. 10/11/12 is parsed
          as 2010-11-12.
        - If both dayfirst and yearfirst are True, yearfirst takes
          precedence (as with dateutil).

        Warning: yearfirst=True is not strict, but will prefer to parse
        with year first (this is a known bug, based on dateutil behavior).

        .. versionadded:: 0.16.1

    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well).
    box : boolean, default True

        - If True returns a DatetimeIndex
        - If False returns ndarray of values.
    format : string, default None
        strftime to parse time, e.g. "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
    exact : boolean, True by default

        - If True, require an exact format match.
        - If False, allow the format to match anywhere in the target string.

    unit : string, default 'ns'
        Denotes the unit of the arg (D, s, ms, us, ns), which must be an
        integer or float number, interpreted relative to the origin.
        For example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : boolean, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.
    origin : scalar, default is 'unix'
        Define the reference date. Numeric values will be parsed as the
        number of units (defined by `unit`) since this reference date.

        - If 'unix' (or POSIX) time; origin is set to 1970-01-01.
        - If 'julian', unit must be 'D', and origin is set to beginning of
          Julian Calendar. Julian day number 0 is assigned to the day starting
          at noon on January 1, 4713 BC.
        - If Timestamp convertible, origin is set to Timestamp identified by
          origin.

        .. versionadded:: 0.20.0
    cache : boolean, default False
        If True, use a cache of unique, converted dates to apply the datetime
        conversion. May produce a significant speed-up when parsing duplicate date
        strings, especially ones with timezone offsets.

        .. versionadded:: 0.22.0

    Returns
    -------
    ret : datetime if parsing succeeded.
        Return type depends on input:

        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

        When it is not possible to return designated types (e.g. when
        any element of input is before Timestamp.min or after Timestamp.max),
        the return will have datetime.datetime type (or corresponding
        array/Series).

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can
    be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns'] or plurals of the same

    >>> df = pd.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> pd.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]

    If a date does not meet the `timestamp limitations
    <http://pandas.pydata.org/pandas-docs/stable/timeseries.html
    #timeseries-timestamp-limits>`_, passing errors='ignore'
    will return the original input instead of raising any exception.

    Passing errors='coerce' will force an out-of-bounds date to NaT,
    in addition to forcing non-dates (or non-parseable dates) to NaT.

    >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
    datetime.datetime(1300, 1, 1, 0, 0)
    >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
    NaT

    Passing infer_datetime_format=True can often speed up parsing when the
    input is not exactly in ISO 8601 format but does follow a regular format.

    >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000)

    >>> s.head()
    0    3/11/2000
    1    3/12/2000
    2    3/13/2000
    3    3/11/2000
    4    3/12/2000
    dtype: object

    >>> %timeit pd.to_datetime(s,infer_datetime_format=True)
    100 loops, best of 3: 10.4 ms per loop

    >>> %timeit pd.to_datetime(s,infer_datetime_format=False)
    1 loop, best of 3: 471 ms per loop

    Using a unix epoch time

    >>> pd.to_datetime(1490195805, unit='s')
    Timestamp('2017-03-22 15:16:45')
    >>> pd.to_datetime(1490195805433502912, unit='ns')
    Timestamp('2017-03-22 15:16:45.433502912')

    .. warning:: For float arg, precision rounding might happen. To prevent
        unexpected behavior use a fixed-width exact type.

    Using a non-unix epoch origin

    >>> pd.to_datetime([1, 2, 3], unit='D',
    ...                origin=pd.Timestamp('1960-01-01'))
    DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
                  dtype='datetime64[ns]', freq=None)

    See Also
    --------
    pandas.DataFrame.astype : Cast argument to a specified dtype.
    pandas.to_timedelta : Convert argument to timedelta.
    """
    from pandas.core.indexes.datetimes import DatetimeIndex

    tz = 'utc' if utc else None

    def _convert_listlike(arg, box, format, name=None, tz=tz):

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # these are shortcutable
        if is_datetime64tz_dtype(arg):
            if not isinstance(arg, DatetimeIndex):
                return DatetimeIndex(arg, tz=tz, name=name)
            if utc:
                arg = arg.tz_convert(None).tz_localize('UTC')
            return arg

        elif is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz=tz, name=name)
                except ValueError:
                    pass

            return arg

        elif unit is not None:
            if format is not None:
                raise ValueError("cannot specify both format and unit")
            arg = getattr(arg, 'values', arg)
            result = tslib.array_with_unit_to_datetime(arg, unit,
                                                       errors=errors)
            if box:
                if errors == 'ignore':
                    from pandas import Index
                    return Index(result)

                return DatetimeIndex(result, tz=tz, name=name)
            return result
        elif getattr(arg, 'ndim', 1) > 1:
            raise TypeError('arg must be a string, datetime, list, tuple, '
                            '1-d array, or Series')

        arg = _ensure_object(arg)
        require_iso8601 = False

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes process slower in this
            # special case
            format_is_iso8601 = _format_is_iso(format)
            if format_is_iso8601:
                require_iso8601 = not infer_datetime_format
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, errors=errors)
                    except (ValueError, TypeError):
                        raise ValueError("cannot convert the input to "
                                         "'%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = array_strptime(arg, format, exact=exact,
                                                errors=errors)
                    except tslib.OutOfBoundsDatetime:
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # if format was inferred, try falling back
                        # to array_to_datetime - terminate here
                        # for specified formats
                        if not infer_datetime_format:
                            if errors == 'raise':
                                raise
                            result = arg

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(
                    arg,
                    errors=errors,
                    utc=utc,
                    dayfirst=dayfirst,
                    yearfirst=yearfirst,
                    require_iso8601=require_iso8601
                )

            if is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz=tz, name=name)
            return result

        except ValueError as e:
            try:
                values, tz = conversion.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return None

    # handle origin
    if origin == 'julian':

        original = arg
        j0 = tslib.Timestamp(0).to_julian_date()
        if unit != 'D':
            raise ValueError("unit must be 'D' for origin='julian'")
        try:
            arg = arg - j0
        except TypeError:
            raise ValueError("incompatible 'arg' type for given "
                             "'origin'='julian'")

        # preemptively check this for a nice range
        j_max = tslib.Timestamp.max.to_julian_date() - j0
        j_min = tslib.Timestamp.min.to_julian_date() - j0
        if np.any(arg > j_max) or np.any(arg < j_min):
            raise tslib.OutOfBoundsDatetime(
                "{original} is Out of Bounds for "
                "origin='julian'".format(original=original))

    elif origin not in ['unix', 'julian']:

        # arg must be a numeric
        original = arg
        if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or
                is_numeric_dtype(np.asarray(arg))):
            raise ValueError(
                "'{arg}' is not compatible with origin='{origin}'; "
                "it must be numeric with a unit specified ".format(
                    arg=arg,
                    origin=origin))

        # we are going to offset back to unix / epoch time
        try:
            offset = tslib.Timestamp(origin)
        except tslib.OutOfBoundsDatetime:
            raise tslib.OutOfBoundsDatetime(
                "origin {origin} is Out of Bounds".format(origin=origin))
        except ValueError:
            raise ValueError("origin {origin} cannot be converted "
                             "to a Timestamp".format(origin=origin))

        if offset.tz is not None:
            raise ValueError(
                "origin offset {} must be tz-naive".format(offset))
        offset -= tslib.Timestamp(0)

        # convert the offset to the unit of the arg
        # this should be lossless in terms of precision
        offset = offset // tslib.Timedelta(1, unit=unit)

        # scalars & ndarray-like can handle the addition
        if is_list_like(arg) and not isinstance(
                arg, (ABCSeries, ABCIndexClass, np.ndarray)):
            arg = np.asarray(arg)
        arg = arg + offset

    if isinstance(arg, tslib.Timestamp):
        result = arg
    elif isinstance(arg, ABCSeries):
        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
        if not cache_array.empty:
            result = arg.map(cache_array)
        else:
            from pandas import Series
            values = _convert_listlike(arg._values, True, format)
            result = Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (ABCDataFrame, MutableMapping)):
        result = _assemble_from_unit_mappings(arg, errors=errors)
    elif isinstance(arg, ABCIndexClass):
        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
        if not cache_array.empty:
            result = _convert_and_box_cache(arg, cache_array, box, errors,
                                            name=arg.name)
        else:
            result = _convert_listlike(arg, box, format, name=arg.name)
    elif is_list_like(arg):
        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
        if not cache_array.empty:
            result = _convert_and_box_cache(arg, cache_array, box, errors)
        else:
            result = _convert_listlike(arg, box, format)
    else:
        result = _convert_listlike(np.array([arg]), box, format)[0]

    return result
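The origin='julian' branch in action (unit must be 'D'; Julian day 2451545 is noon on 2000-01-01):

import pandas as pd

pd.to_datetime(2451545, unit='D', origin='julian')
# Timestamp('2000-01-01 12:00:00')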
Code example #43
File: boolean.py Project: JacobGreen770/Pancham
def coerce_to_array(values, mask=None, copy: bool = False):
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    if isinstance(values, BooleanArray):
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        values, mask = values._data, values._mask
        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    mask_values = None
    if isinstance(values, np.ndarray) and values.dtype == np.bool_:
        if copy:
            values = values.copy()
    elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype):
        mask_values = isna(values)

        values_bool = np.zeros(len(values), dtype=bool)
        values_bool[~mask_values] = values[~mask_values].astype(bool)

        if not np.all(values_bool[~mask_values].astype(values.dtype) ==
                      values[~mask_values]):
            raise TypeError("Need to pass bool-like values")

        values = values_bool
    else:
        values_object = np.asarray(values, dtype=object)

        inferred_dtype = lib.infer_dtype(values_object, skipna=True)
        integer_like = ("floating", "integer", "mixed-integer-float")
        if inferred_dtype not in ("boolean", "empty") + integer_like:
            raise TypeError("Need to pass bool-like values")

        mask_values = isna(values_object)
        values = np.zeros(len(values), dtype=bool)
        values[~mask_values] = values_object[~mask_values].astype(bool)

        # if the values were integer-like, validate it were actually 0/1's
        if inferred_dtype in integer_like:
            if not np.all(values[~mask_values].astype(float) ==
                          values_object[~mask_values].astype(float)):
                raise TypeError("Need to pass bool-like values")

    if mask is None and mask_values is None:
        mask = np.zeros(len(values), dtype=bool)
    elif mask is None:
        mask = mask_values
    else:
        if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
            if mask_values is not None:
                mask = mask | mask_values
            else:
                if copy:
                    mask = mask.copy()
        else:
            mask = np.array(mask, dtype=bool)
            if mask_values is not None:
                mask = mask | mask_values

    if not values.ndim == 1:
        raise ValueError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise ValueError("mask must be a 1D list-like")

    return values, mask
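This coercion is what backs the nullable "boolean" extension dtype; integer-like input must consist of 0/1 (plus missing values). A small sketch through the public constructor:

import pandas as pd

pd.array([1, 0, None], dtype="boolean")
# <BooleanArray>
# [True, False, <NA>]
# Length: 3, dtype: boolean

# pd.array([2, 0], dtype="boolean")  # would raise: Need to pass bool-like values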