Example #1
import numpy as np
import pandas as pd
import pandas._testing as tm
import pytest
from pandas.api.types import is_extension_array_dtype


def test_binary_input_aligns_columns(request, dtype_a, dtype_b):
    if (is_extension_array_dtype(dtype_a) or isinstance(dtype_a, dict)
            or is_extension_array_dtype(dtype_b) or isinstance(dtype_b, dict)):
        request.node.add_marker(
            pytest.mark.xfail(
                reason="Extension / mixed with multiple inputs not implemented."
            ))

    df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}).astype(dtype_a)

    if isinstance(dtype_a, dict) and isinstance(dtype_b, dict):
        dtype_b["C"] = dtype_b.pop("B")

    df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b)
    with tm.assert_produces_warning(FutureWarning):
        result = np.heaviside(df1, df2)
    # Expected future behaviour:
    # expected = np.heaviside(
    #     np.array([[1, 3, np.nan], [2, 4, np.nan]]),
    #     np.array([[1, np.nan, 3], [2, np.nan, 4]]),
    # )
    # expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"])
    expected = pd.DataFrame([[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)

    # ensure the expected is the same when applying with numpy array
    result = np.heaviside(df1, df2.values)
    tm.assert_frame_equal(result, expected)
Example #2
    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        if exc is None:
            sdtype = tm.get_dtype(s)

            if (hasattr(other, "dtype")
                    and not is_extension_array_dtype(other.dtype)
                    and is_integer_dtype(other.dtype)
                    and sdtype.is_unsigned_integer):
                # TODO: comment below is inaccurate; other can be int8, int16, ...
                #  and the trouble is that e.g. if s is UInt8 and other is int8,
                #  then result is UInt16
                # other is np.int64 and would therefore always result in
                # upcasting, so keeping other as same numpy_dtype
                other = other.astype(sdtype.numpy_dtype)

            result = op(s, other)
            expected = self._combine(s, other, op)

            if op_name in ("__rtruediv__", "__truediv__", "__div__"):
                expected = expected.fillna(np.nan).astype("Float64")
            else:
                # the combine method results in the 'biggest' (int64) dtype
                expected = expected.astype(sdtype)

            self.assert_equal(result, expected)
        else:
            with pytest.raises(exc):
                op(s, other)
Example #3
import numpy as np
import pandas as pd
import pandas._testing as tm
import pytest
from pandas.api.types import is_extension_array_dtype


def test_binary_input_aligns_index(request, dtype):
    if is_extension_array_dtype(dtype) or isinstance(dtype, dict):
        request.node.add_marker(
            pytest.mark.xfail(
                reason="Extension / mixed with multiple inputs not implemented."
            ))
    df1 = pd.DataFrame({
        "A": [1, 2],
        "B": [3, 4]
    }, index=["a", "b"]).astype(dtype)
    df2 = pd.DataFrame({
        "A": [1, 2],
        "B": [3, 4]
    }, index=["a", "c"]).astype(dtype)
    with tm.assert_produces_warning(FutureWarning):
        result = np.heaviside(df1, df2)
    # Expected future behaviour:
    # expected = np.heaviside(
    #     np.array([[1, 3], [3, 4], [np.nan, np.nan]]),
    #     np.array([[1, 3], [np.nan, np.nan], [3, 4]]),
    # )
    # # TODO(FloatArray): this will be Float64Dtype.
    # expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"])
    expected = pd.DataFrame([[1.0, 1.0], [1.0, 1.0]],
                            columns=["A", "B"],
                            index=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # ensure the expected is the same when applying with numpy array
    result = np.heaviside(df1, df2.values)
    tm.assert_frame_equal(result, expected)
Example #4
def block_to_header_bytes(block):
    values = block.values
    try:
        # pandas >= 0.19
        from pandas.api.types import is_datetime64tz_dtype
    except ImportError:
        from pandas.core.common import is_datetime64tz_dtype

    if isinstance(values, pd.Categorical):
        extension = ('categorical_type', (values.ordered, values.categories))
        values = values.codes
    elif is_datetime64tz_dtype(block):
        extension = ('datetime64_tz_type', (block.values.tzinfo, ))
        values = values.view('i8')
    elif is_extension_array_dtype(block.dtype):
        extension = ("other", ())
    else:
        extension = ('numpy_type', ())

    header = (block.mgr_locs.as_array, values.dtype, values.shape, extension)
    if extension == ("other", ()):
        bytes = pickle.dumps(values)
    else:
        bytes = pnp.compress(pnp.serialize(values), values.dtype)
    return header, bytes
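
The function above chooses a serialization strategy from the block's values: Categoricals store their codes, tz-aware datetimes store an i8 view, any other extension array falls back to pickle, and plain NumPy blocks go through the compressed path. A minimal sketch of that dispatch applied to ordinary column values (the classify helper and the sample inputs are illustrative, not part of the original module; the tz-aware branch is omitted):

import pandas as pd
from pandas.api.types import is_extension_array_dtype

def classify(values):
    # Same branch order as block_to_header_bytes: Categorical first,
    # then any other extension array, then plain NumPy values.
    if isinstance(values, pd.Categorical):
        return "categorical_type"   # serialized via .codes
    if is_extension_array_dtype(values.dtype):
        return "other"              # falls back to pickle.dumps(values)
    return "numpy_type"             # compressed NumPy serialization

classify(pd.Categorical(["a", "b"]))             # 'categorical_type'
classify(pd.array([1, 2, None], dtype="Int64"))  # 'other'
classify(pd.Series([1.0, 2.0]).to_numpy())       # 'numpy_type'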
Example #5
    def dtype(self) -> Optional[str]:
        """String representation of the dtype."""
        dtype_ = self._pandas_dtype
        if dtype_ is None:
            return dtype_

        if is_extension_array_dtype(dtype_):
            if isinstance(dtype_, type):
                try:
                    # Convert to str here because some pandas dtypes allow
                    # an empty constructor for compatibility but fail on str(),
                    # e.g. PeriodDtype.
                    return str(dtype_())
                except (TypeError, AttributeError) as err:
                    raise TypeError(
                        f"Pandas dtype {dtype_} cannot be instantiated: "
                        f"{err}\n Usage Tip: Use an instance or a string "
                        "representation.") from err
            return str(dtype_)

        if dtype_ in dtypes.NUMPY_TYPES:
            dtype_ = PandasDtype.from_numpy_type(dtype_)
        elif isinstance(dtype_, str):
            dtype_ = PandasDtype.from_str_alias(dtype_)
        elif isinstance(dtype_, type):
            dtype_ = PandasDtype.from_python_type(dtype_)

        if isinstance(dtype_, dtypes.PandasDtype):
            return dtype_.str_alias
        raise TypeError(
            "type of `pandas_dtype` argument not recognized: %s. "
            "Please specify a pandera PandasDtype enum, legal pandas data "
            "type, pandas data type string alias, or numpy data type "
            "string alias." % type(self._pandas_dtype))
Example #6
    def astype(self, dtype, copy=True):
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, RaggedDtype):
            if copy:
                return self.copy()
            return self

        elif is_extension_array_dtype(dtype):
            return dtype.construct_array_type()._from_sequence(
                np.asarray(self))

        return np.array([v for v in self], dtype=dtype, copy=copy)
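
For context, dtype.construct_array_type()._from_sequence(...) is how the method above converts to some other extension dtype: the dtype names its array class, and the array class builds itself from a plain sequence. A minimal sketch with a standard pandas nullable dtype (the target dtype is just an example, and _from_sequence is a pandas-internal constructor, so treat this as illustrative):

import numpy as np
import pandas as pd

target = pd.Int64Dtype()  # any pandas extension dtype works the same way
arr = target.construct_array_type()._from_sequence(np.asarray([1, 2, 3]))
# arr is an IntegerArray with dtype Int64, built from a plain NumPy sequence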
Example #7
import numpy as np
import pandas as pd
import pandas._testing as tm
import pytest
from pandas.api.types import is_extension_array_dtype


def test_unary_binary(request, dtype):
    # unary input, binary output
    if is_extension_array_dtype(dtype) or isinstance(dtype, dict):
        request.node.add_marker(
            pytest.mark.xfail(
                reason="Extension / mixed with multiple outputs not implemented."
            ))

    values = np.array([[-1, -1], [1, 1]], dtype="int64")
    df = pd.DataFrame(values, columns=["A", "B"],
                      index=["a", "b"]).astype(dtype=dtype)
    result_pandas = np.modf(df)
    assert isinstance(result_pandas, tuple)
    assert len(result_pandas) == 2
    expected_numpy = np.modf(values)

    for result, b in zip(result_pandas, expected_numpy):
        expected = pd.DataFrame(b, index=df.index, columns=df.columns)
        tm.assert_frame_equal(result, expected)
Example #8
    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        if exc is None:
            if s.dtype.is_unsigned_integer and (op_name == "__rsub__"):
                # TODO see https://github.com/pandas-dev/pandas/issues/22023
                pytest.skip("unsigned subtraction gives negative values")

            if (
                hasattr(other, "dtype")
                and not is_extension_array_dtype(other.dtype)
                and is_integer_dtype(other.dtype)
            ):
                # other is np.int64 and would therefore always result in
                # upcasting, so keeping other as same numpy_dtype
                other = other.astype(s.dtype.numpy_dtype)

            result = op(s, other)
            expected = s.combine(other, op)

            if op_name in ("__rtruediv__", "__truediv__", "__div__"):
                expected = expected.fillna(np.nan).astype("Float64")
            elif op_name.startswith("__r"):
                # TODO reverse operators result in object dtype
                # see https://github.com/pandas-dev/pandas/issues/22024
                expected = expected.astype(s.dtype)
                result = result.astype(s.dtype)
            else:
                # combine method result in 'biggest' (int64) dtype
                expected = expected.astype(s.dtype)
                pass

            if (op_name == "__rpow__") and isinstance(other, pd.Series):
                # TODO pow on Int arrays gives different result with NA
                # see https://github.com/pandas-dev/pandas/issues/22022
                result = result.fillna(1)

            self.assert_series_equal(result, expected)
        else:
            with pytest.raises(exc):
                op(s, other)
Example #9
    def get_str_dtype(cls, pandas_dtype_arg):
        """Get pandas-compatible string representation of dtype."""
        dtype_ = pandas_dtype_arg
        if dtype_ is None:
            return dtype_

        if is_extension_array_dtype(dtype_):
            if isinstance(dtype_, type):
                try:
                    # Convert to str here because some pandas dtypes allow
                    # an empty constructor for compatibility but fail on
                    # str(), e.g. PeriodDtype.
                    return str(dtype_())
                except (TypeError, AttributeError) as err:
                    raise TypeError(
                        f"Pandas dtype {dtype_} cannot be instantiated: "
                        f"{err}\n Usage Tip: Use an instance or a string "
                        "representation."
                    ) from err
            return str(dtype_)

        if dtype_ in NUMPY_TYPES:
            dtype_ = cls.from_numpy_type(dtype_)
        elif isinstance(dtype_, str):
            dtype_ = cls.from_str_alias(dtype_)
        elif isinstance(dtype_, type):
            dtype_ = cls.from_python_type(dtype_)

        if isinstance(dtype_, cls):
            return dtype_.str_alias
        raise TypeError(
            "type of `pandas_dtype` argument not recognized: "
            f"{type(pandas_dtype_arg)}. Please specify a pandera PandasDtype "
            "enum, legal pandas data type, pandas data type string alias, or "
            "numpy data type string alias"
        )
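
The class-versus-instance branch above exists because callers may pass an extension dtype class rather than an instance; instantiating it and calling str() yields the pandas string alias. A short illustration using built-in pandas extension dtypes (chosen here only as examples):

import pandas as pd

dtype_cls = pd.Int64Dtype   # a dtype *class*, as that branch expects
str(dtype_cls())            # 'Int64'
str(pd.CategoricalDtype())  # 'category' for an instance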
Example #10
    def time_is_extension_array_dtype_false(self):
        is_extension_array_dtype(self.np_dtype)
Example #11
    def time_is_extension_array_dtype_true(self):
        is_extension_array_dtype(self.ext_dtype)
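
Both benchmarks time the same predicate; for reference, this is what it returns for the two kinds of input (the concrete dtypes below are assumptions, since the benchmark's setup is not shown):

import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype

is_extension_array_dtype(np.dtype("int64"))                # False: plain NumPy dtype
is_extension_array_dtype(pd.CategoricalDtype())            # True: pandas extension dtype
is_extension_array_dtype(pd.array([1, 2], dtype="Int64"))  # True: also accepts arrays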
Example #12
    def numeric(self) -> pd.DataFrame:
        """
        Descriptive statistics for numeric data

        Returns
        -------
        DataFrame
            The statistics of the numeric columns
        """
        df: pd.DataFrame = self._data.loc[:, self._is_numeric]
        cols = df.columns
        _, k = df.shape
        std = df.std()
        count = df.count()
        mean = df.mean()
        mad = (df - mean).abs().mean()
        std_err = std.copy()
        std_err.loc[count > 0] /= count.loc[count > 0]
        if self._use_t:
            q = stats.t(count - 1).ppf(1.0 - self._alpha / 2)
        else:
            q = stats.norm.ppf(1.0 - self._alpha / 2)

        def _mode(ser):
            mode_res = stats.mode(ser.dropna())
            if mode_res[0].shape[0] > 0:
                return [float(val) for val in mode_res]
            return np.nan, np.nan

        mode_values = df.apply(_mode).T
        if mode_values.size > 0:
            if isinstance(mode_values, pd.DataFrame):
                # pandas 1.0 or later
                mode = np.asarray(mode_values[0], dtype=float)
                mode_counts = np.asarray(mode_values[1], dtype=np.int64)
            else:
                # pandas before 1.0 returns a Series of 2-elem list
                mode = []
                mode_counts = []
                for idx in mode_values.index:
                    val = mode_values.loc[idx]
                    mode.append(val[0])
                    mode_counts.append(val[1])
                mode = np.atleast_1d(mode)
                mode_counts = np.atleast_1d(mode_counts)
        else:
            mode = mode_counts = np.empty(0)
        loc = count > 0
        mode_freq = np.full(mode.shape[0], np.nan)
        mode_freq[loc] = mode_counts[loc] / count.loc[loc]
        # TODO: Workaround for pandas AbstractMethodError in extension
        #  types. Remove when quantile is supported for these
        _df = df
        try:
            from pandas.api.types import is_extension_array_dtype
            _df = df.copy()
            for col in df:
                if is_extension_array_dtype(df[col].dtype):
                    _df[col] = _df[col].astype(object).fillna(np.nan)
        except ImportError:
            pass

        if df.shape[1] > 0:
            iqr = _df.quantile(0.75) - _df.quantile(0.25)
        else:
            iqr = mean

        def _safe_jarque_bera(c):
            a = np.asarray(c)
            if a.shape[0] < 2:
                return (np.nan, ) * 4
            return jarque_bera(a)

        jb = df.apply(lambda x: list(_safe_jarque_bera(x.dropna())),
                      result_type="expand").T
        nan_mean = mean.copy()
        nan_mean.loc[nan_mean == 0] = np.nan
        coef_var = std / nan_mean

        results = {
            "nobs": pd.Series(np.ones(k, dtype=np.int64) * df.shape[0],
                              index=cols),
            "missing": df.shape[0] - count,
            "mean": mean,
            "std_err": std_err,
            "upper_ci": mean + q * std_err,
            "lower_ci": mean - q * std_err,
            "std": std,
            "iqr": iqr,
            "mad": mad,
            "coef_var": coef_var,
            "range": pd_ptp(df),
            "max": df.max(),
            "min": df.min(),
            "skew": jb[2],
            "kurtosis": jb[3],
            "iqr_normal": iqr / np.diff(stats.norm.ppf([0.25, 0.75])),
            "mad_normal": mad / np.sqrt(2 / np.pi),
            "jarque_bera": jb[0],
            "jarque_bera_pval": jb[1],
            "mode": pd.Series(mode, index=cols),
            "mode_freq": pd.Series(mode_freq, index=cols),
            "median": df.median(),
        }
        final = {k: v for k, v in results.items() if k in self._stats}
        results_df = pd.DataFrame(list(final.values()),
                                  columns=cols,
                                  index=list(final.keys()))
        if "percentiles" not in self._stats:
            return results_df
        # Pandas before 1.0 cannot handle empty DF
        if df.shape[1] > 0:
            # TODO: Remove when extension types support quantile
            perc = _df.quantile(self._percentiles / 100).astype(float)
        else:
            perc = pd.DataFrame(index=self._percentiles / 100, dtype=float)
        if np.all(np.floor(100 * perc.index) == (100 * perc.index)):
            perc.index = [f"{int(100 * idx)}%" for idx in perc.index]
        else:
            dupe = True
            scale = 100
            index = perc.index
            while dupe:
                scale *= 10
                idx = np.floor(scale * perc.index)
                if np.all(np.diff(idx) > 0):
                    dupe = False
            index = np.floor(scale * index) / (scale / 100)
            fmt = f"0.{len(str(scale//100))-1}f"
            output = f"{{0:{fmt}}}%"
            perc.index = [output.format(val) for val in index]

        return self._reorder(pd.concat([results_df, perc], axis=0))
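
The iqr_normal and mad_normal entries above rescale the IQR and the mean absolute deviation by their expected values under a standard normal, turning both into robust estimates of the standard deviation. The constants, computed the same way the method does:

import numpy as np
from scipy import stats

iqr_scale = np.diff(stats.norm.ppf([0.25, 0.75]))[0]  # ~1.349, the IQR of N(0, 1)
mad_scale = np.sqrt(2 / np.pi)                         # ~0.798, E|X| for X ~ N(0, 1)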