Example #1
0
    def test_quantile_datetime(self):
        df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})

        # exclude datetime
        result = df.quantile(.5)
        expected = Series([2.5], index=['b'])

        # datetime
        result = df.quantile(.5, numeric_only=False)
        expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
                          index=['a', 'b'],
                          name=0.5)
        assert_series_equal(result, expected)

        # datetime w/ multi
        result = df.quantile([.5], numeric_only=False)
        expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
                             index=[.5], columns=['a', 'b'])
        assert_frame_equal(result, expected)

        # axis = 1
        df['c'] = pd.to_datetime(['2011', '2012'])
        result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
        expected = Series([Timestamp('2010-07-02 12:00:00'),
                           Timestamp('2011-07-02 12:00:00')],
                          index=[0, 1],
                          name=0.5)
        assert_series_equal(result, expected)

        result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
        expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
                               Timestamp('2011-07-02 12:00:00')]],
                             index=[0.5], columns=[0, 1])
        assert_frame_equal(result, expected)
Example #2
0
    def test_quantile_nat(self):

        # full NaT column
        df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})

        res = df.quantile(0.5, numeric_only=False)
        exp = Series([pd.NaT], index=['a'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], numeric_only=False)
        exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
        tm.assert_frame_equal(res, exp)

        # mixed non-null / full null column
        df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
                              pd.Timestamp('2012-01-02'),
                              pd.Timestamp('2012-01-03')],
                        'b': [pd.NaT, pd.NaT, pd.NaT]})

        res = df.quantile(0.5, numeric_only=False)
        exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
                     name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], numeric_only=False)
        exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
                        columns=['a', 'b'])
        tm.assert_frame_equal(res, exp)
Example #3
0
    def test_quantile_empty(self):
        """Check ``DataFrame.quantile`` on frames that have columns but no rows.

        Only the float64 case is asserted; the int64 and datetime64 frames
        are built but nothing is checked against them in this block.
        """

        # floats
        df = DataFrame(columns=['a', 'b'], dtype='float64')

        # scalar quantile -> one NaN per column, named after the quantile
        res = df.quantile(0.5)
        exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        # list-like quantile -> one row per requested quantile
        res = df.quantile([0.5])
        exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
        tm.assert_frame_equal(res, exp)

        # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
        # res = df.quantile(0.5, axis=1)
        # res = df.quantile([0.5], axis=1)

        # ints
        df = DataFrame(columns=['a', 'b'], dtype='int64')

        # FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
        # res = df.quantile(0.5)

        # datetimes
        # NOTE(review): this frame is never used below -- the datetime case
        # appears to be unfinished or cut off; confirm against upstream.
        df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
Example #4
0
    def test_quantile_interpolation(self):
        """Exercise the ``interpolation`` argument of ``DataFrame.quantile``.

        Results are compared with ``numpy.percentile`` and with hand-written
        expectations for 'linear', 'nearest', 'lower', 'higher' and
        'midpoint'. Reads ``self.tsframe`` and ``self.intframe``.
        """
        # see gh-10174
        from numpy import percentile

        # interpolation = linear (default case)
        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
        assert q['A'] == percentile(self.tsframe['A'], 10)
        q = self.intframe.quantile(0.1)
        assert q['A'] == percentile(self.intframe['A'], 10)

        # test with and without interpolation keyword
        # NOTE(review): both `q` (above) and `q1` are computed without the
        # keyword, so this compares two identical calls -- confirm intent.
        q1 = self.intframe.quantile(0.1)
        assert q1['A'] == np.percentile(self.intframe['A'], 10)
        tm.assert_series_equal(q, q1)

        # interpolation method other than default linear
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1, interpolation='nearest')
        expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)

        # cross-check interpolation=nearest results in original dtype
        # NOTE(review): numpy is asked for the 0.5th percentile (``.5``),
        # not the 50th -- verify this is deliberate.
        exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5,
                            axis=0, interpolation='nearest')
        expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
        tm.assert_series_equal(result, expected)

        # float
        df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1, interpolation='nearest')
        expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)
        exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5,
                            axis=0, interpolation='nearest')
        expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
        assert_series_equal(result, expected)

        # axis
        result = df.quantile([.5, .75], axis=1, interpolation='lower')
        expected = DataFrame({1: [1., 1.], 2: [2., 2.],
                              3: [3., 3.]}, index=[0.5, 0.75])
        assert_frame_equal(result, expected)

        # test degenerate case
        df = DataFrame({'x': [], 'y': []})
        q = df.quantile(0.1, axis=0, interpolation='higher')
        assert(np.isnan(q['x']) and np.isnan(q['y']))

        # multi
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['a', 'b', 'c'])
        result = df.quantile([.25, .5], interpolation='midpoint')

        # https://github.com/numpy/numpy/issues/7163
        expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
        assert_frame_equal(result, expected)
Example #5
0
class Quantile(object):
    """Timing case for ``DataFrame.quantile`` with ``axis`` as the parameter."""

    param_names = ['axis']
    params = [0, 1]

    def setup(self, axis):
        # 1000 x 3 frame of np.random.randn draws, columns A, B and C;
        # `axis` is accepted but not needed to build the data
        draws = np.random.randn(1000, 3)
        self.df = DataFrame(draws, columns=['A', 'B', 'C'])

    def time_frame_quantile(self, axis):
        # the result is discarded -- only the call itself matters here
        frame = self.df
        frame.quantile([0.1, 0.5], axis=axis)
Example #6
0
    def test_quantile_axis_mixed(self):
        """Row-wise quantile on a frame mixing int, float, datetime and str.

        With ``numeric_only=True`` only the numeric columns take part; with
        ``numeric_only=False`` the mixed row cannot be ordered and a
        ``TypeError`` is expected.
        """

        # mixed on axis=1
        df = DataFrame({"A": [1, 2, 3],
                        "B": [2., 3., 4.],
                        "C": pd.date_range('20130101', periods=3),
                        "D": ['foo', 'bar', 'baz']})
        # spell out numeric_only so the check does not rely on the
        # library's default for that argument
        result = df.quantile(.5, axis=1, numeric_only=True)
        expected = Series([1.5, 2.5, 3.5], name=0.5)
        assert_series_equal(result, expected)

        # must raise
        with pytest.raises(TypeError):
            df.quantile(.5, axis=1, numeric_only=False)
Example #7
0
    def test_quantile_axis_parameter(self):
        """Exercise the ``axis`` argument of ``DataFrame.quantile``.

        The integer and string spellings of each axis must give the same
        result, and an unknown axis must raise ``ValueError`` with the
        expected message.
        """
        # GH 9543/9544

        frame = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

        # axis=0 and its string spelling "index"
        by_index = frame.quantile(.5, axis=0)
        assert_series_equal(
            by_index, Series([2., 3.], index=["A", "B"], name=0.5))
        assert_series_equal(by_index, frame.quantile(.5, axis="index"))

        # axis=1 and its string spelling "columns"
        want = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(frame.quantile(.5, axis=1), want)
        assert_series_equal(frame.quantile(.5, axis="columns"), want)

        # invalid axes are rejected
        msg = ("No axis named -1 for object type"
               " <class 'pandas.core.frame.DataFrame'>")
        with pytest.raises(ValueError, match=msg):
            frame.quantile(0.1, axis=-1)
        msg = ("No axis named column for object type"
               " <class 'pandas.core.frame.DataFrame'>")
        with pytest.raises(ValueError, match=msg):
            frame.quantile(0.1, axis="column")
Example #8
0
    def test_quantile_multi(self):
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['a', 'b', 'c'])
        result = df.quantile([.25, .5])
        expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
        assert_frame_equal(result, expected)

        # axis = 1
        result = df.quantile([.25, .5], axis=1)
        expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                             index=[.25, .5], columns=[0, 1, 2])

        # empty
        result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
        expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
                             index=[.1, .9])
        assert_frame_equal(result, expected)
Example #9
0
 def test_empty_datelike(
     self, dtype, expected_data, expected_index, axis, expected_dtype
 ):
     # GH 14564
     df = DataFrame(columns=["a", "b"], dtype=dtype)
     result = df.quantile(0.5, axis=axis, numeric_only=False)
     expected = Series(
         expected_data, name=0.5, index=Index(expected_index), dtype=expected_dtype
     )
     tm.assert_series_equal(result, expected)
Example #10
0
    def test_quantile(self, datetime_frame):
        """Basic ``DataFrame.quantile`` checks on the ``datetime_frame`` fixture.

        Compares against ``numpy.percentile``, then covers the empty frame,
        exclusion of non-numeric columns, and scalar / list-like quantiles
        along ``axis=1``.
        """
        frame = datetime_frame

        by_col = frame.quantile(0.1, axis=0, numeric_only=True)
        assert by_col["A"] == np.percentile(frame["A"], 10)
        tm.assert_index_equal(by_col.index, frame.columns)

        by_row = frame.quantile(0.9, axis=1, numeric_only=True)
        assert by_row["2000-01-17"] == np.percentile(frame.loc["2000-01-17"], 90)
        tm.assert_index_equal(by_row.index, frame.index)

        # test degenerate case
        no_rows = DataFrame({"x": [], "y": []})
        degenerate = no_rows.quantile(0.1, axis=0, numeric_only=True)
        assert np.isnan(degenerate["x"]) and np.isnan(degenerate["y"])

        # non-numeric exclusion
        labelled = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
        got = labelled.quantile(0.5, numeric_only=True)
        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
            want = labelled.median().rename(0.5)
        tm.assert_series_equal(got, want)

        # axis
        small = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        got = small.quantile(0.5, axis=1)
        want = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(got, want)

        got = small.quantile([0.5, 0.75], axis=1)
        want = DataFrame(
            {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
        )
        tm.assert_frame_equal(got, want, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        ragged = DataFrame([[1, 2, 3], ["a", "b", 4]])
        got = ragged.quantile(0.5, axis=1, numeric_only=True)
        want = Series([3.0, 4.0], index=[0, 1], name=0.5)
        tm.assert_series_equal(got, want)
Example #11
0
    def test_quantile(self, datetime_frame):
        """Basic ``DataFrame.quantile`` checks on the ``datetime_frame`` fixture.

        Compares against ``numpy.percentile``, then covers the empty frame,
        exclusion of non-numeric columns, and scalar / list-like quantiles
        along ``axis=1``.
        """
        frame = datetime_frame

        by_col = frame.quantile(0.1, axis=0)
        assert by_col['A'] == np.percentile(frame['A'], 10)
        tm.assert_index_equal(by_col.index, frame.columns)

        by_row = frame.quantile(0.9, axis=1)
        row_p90 = np.percentile(frame.loc['2000-01-17'], 90)
        assert by_row['2000-01-17'] == row_p90
        tm.assert_index_equal(by_row.index, frame.index)

        # test degenerate case
        degenerate = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert np.isnan(degenerate['x']) and np.isnan(degenerate['y'])

        # non-numeric exclusion
        labelled = DataFrame({'col1': ['A', 'A', 'B', 'B'],
                              'col2': [1, 2, 3, 4]})
        got = labelled.quantile(0.5)
        want = labelled.median().rename(0.5)
        assert_series_equal(got, want)

        # axis
        small = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        got = small.quantile(.5, axis=1)
        want = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(got, want)

        got = small.quantile([.5, .75], axis=1)
        want = DataFrame({1: [1.5, 1.75],
                          2: [2.5, 2.75],
                          3: [3.5, 3.75]},
                         index=[0.5, 0.75])
        assert_frame_equal(got, want, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        ragged = DataFrame([[1, 2, 3],
                            ['a', 'b', 4]])
        got = ragged.quantile(.5, axis=1)
        want = Series([3., 4.], index=[0, 1], name=0.5)
        assert_series_equal(got, want)
Example #12
0
    def test_quantile_empty_no_rows_floats(self):

        # floats
        df = DataFrame(columns=["a", "b"], dtype="float64")

        res = df.quantile(0.5)
        exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5])
        exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
        tm.assert_frame_equal(res, exp)

        res = df.quantile(0.5, axis=1)
        exp = Series([], index=[], dtype="float64", name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5], axis=1)
        exp = DataFrame(columns=[], index=[0.5])
        tm.assert_frame_equal(res, exp)
Example #13
0
def unemployment_estimation(
        duration: pandas.DataFrame) -> dict[str, Union[str, int]]:
    """Build a duration estimation from the quantiles listed in `_QUANTILES`.

    Each entry of `_QUANTILES` (name -> quantile level) becomes a
    ``'<name>Days'`` key holding the matching quantile of `duration`,
    truncated to an int. The dict is then handed to
    `finalize_duration_estimation`, whose return value is returned.
    """

    computed = duration.quantile(list(_QUANTILES.values()))
    estimation: dict[str, Union[str, int]] = {
        f'{name}Days': int(typing.cast(float, computed.loc[level]))
        for name, level in _QUANTILES.items()
    }
    return finalize_duration_estimation(estimation)
Example #14
0
def get_IQR(df:pd.DataFrame, k):
    '''
    Compute IQR-based bounds for every column of `df`.

    df : the original data
    k : the multiple of iqr for boundary

    Returns a frame indexed like the quantile result, with columns
    'lower_bound' (Q1 - k * IQR) and 'upper_bound' (Q3 + k * IQR).
    '''
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)
    margin = k * (third_quartile - first_quartile)
    bounds = [
        pd.Series(first_quartile - margin, name='lower_bound'),
        pd.Series(third_quartile + margin, name='upper_bound'),
    ]
    return pd.concat(bounds, axis=1)
Example #15
0
    def test_quantile(self):
        """Basic ``DataFrame.quantile`` checks.

        Compares against ``numpy.percentile`` on ``self.tsframe`` and
        ``self.intframe``, then covers the empty frame, exclusion of
        non-numeric columns, and scalar / list-like quantiles on ``axis=1``.
        """
        from numpy import percentile

        q = self.tsframe.quantile(0.1, axis=0)
        self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
        # NOTE(review): this axis=1 result is overwritten on the next line
        # without being checked -- confirm whether an assertion is missing.
        q = self.tsframe.quantile(0.9, axis=1)
        q = self.intframe.quantile(0.1)
        self.assertEqual(q['A'], percentile(self.intframe['A'], 10))

        # test degenerate case
        q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert (np.isnan(q['x']) and np.isnan(q['y']))

        # non-numeric exclusion
        df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
        rs = df.quantile(0.5)
        xp = df.median()
        assert_series_equal(rs, xp)

        # axis
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3])
        assert_series_equal(result, expected)

        # list-like quantiles: one row per quantile, one column per input row
        result = df.quantile([.5, .75], axis=1)
        expected = DataFrame({
            1: [1.5, 1.75],
            2: [2.5, 2.75],
            3: [3.5, 3.75]
        },
                             index=[0.5, 0.75])
        assert_frame_equal(result, expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
        result = df.quantile(.5, axis=1)
        expected = Series([3., 4.], index=[0, 1])
        assert_series_equal(result, expected)
Example #16
0
    def test_quantile_empty_no_rows_dt64(self):
        # datetimes
        df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")

        res = df.quantile(0.5, numeric_only=False)
        exp = Series(
            [pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5
        )
        tm.assert_series_equal(res, exp)

        # Mixed dt64/dt64tz
        df["a"] = df["a"].dt.tz_localize("US/Central")
        res = df.quantile(0.5, numeric_only=False)
        exp = exp.astype(object)
        tm.assert_series_equal(res, exp)

        # both dt64tz
        df["b"] = df["b"].dt.tz_localize("US/Central")
        res = df.quantile(0.5, numeric_only=False)
        exp = exp.astype(df["b"].dtype)
        tm.assert_series_equal(res, exp)
Example #17
0
def test_rolling_quantile_np_percentile():
    """``DataFrame.quantile`` agrees with ``numpy.percentile`` on random data."""
    # #9413: Tests that rolling window's quantile default behavior
    # is analogous to Numpy's percentile
    n_rows, n_cols = 10, 5
    dates = pd.date_range("20100101", periods=n_rows, freq="B")
    values = np.random.rand(n_rows * n_cols).reshape((n_rows, -1))
    frame = DataFrame(values, index=dates)

    from_pandas = frame.quantile([0.25, 0.5, 0.75], axis=0)
    from_numpy = np.percentile(frame, [25, 50, 75], axis=0)

    tm.assert_almost_equal(from_pandas.values, np.array(from_numpy))
Example #18
0
    def test_quantile_datetime(self):
        df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})

        # exclude datetime
        result = df.quantile(.5)
        expected = Series([2.5], index=['b'])

        # datetime
        result = df.quantile(.5, numeric_only=False)
        expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
                          index=['a', 'b'],
                          name=0.5)
        assert_series_equal(result, expected)

        # datetime w/ multi
        result = df.quantile([.5], numeric_only=False)
        expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
                             index=[.5],
                             columns=['a', 'b'])
        assert_frame_equal(result, expected)

        # axis = 1
        df['c'] = pd.to_datetime(['2011', '2012'])
        result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
        expected = Series([
            Timestamp('2010-07-02 12:00:00'),
            Timestamp('2011-07-02 12:00:00')
        ],
                          index=[0, 1],
                          name=0.5)
        assert_series_equal(result, expected)

        result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
        expected = DataFrame([[
            Timestamp('2010-07-02 12:00:00'),
            Timestamp('2011-07-02 12:00:00')
        ]],
                             index=[0.5],
                             columns=[0, 1])
        assert_frame_equal(result, expected)
Example #19
0
    def test_quantile_nan(self):

        # GH 14357 - float block where some cols have missing values
        df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
        df.iloc[-1, 1] = np.nan

        res = df.quantile(0.5)
        exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75])
        exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

        res = df.quantile(0.5, axis=1)
        exp = Series(np.arange(1.0, 6.0), name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75], axis=1)
        exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

        # full-nan column
        df['b'] = np.nan

        res = df.quantile(0.5)
        exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75])
        exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
                        index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)
Example #20
0
    def test_quantile_nan(self):

        # GH 14357 - float block where some cols have missing values
        df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
        df.iloc[-1, 1] = np.nan

        res = df.quantile(0.5)
        exp = Series([3.0, 2.5], index=["a", "b"], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75])
        exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

        res = df.quantile(0.5, axis=1)
        exp = Series(np.arange(1.0, 6.0), name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75], axis=1)
        exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)

        # full-nan column
        df["b"] = np.nan

        res = df.quantile(0.5)
        exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
        tm.assert_series_equal(res, exp)

        res = df.quantile([0.5, 0.75])
        exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
        tm.assert_frame_equal(res, exp)
Example #21
0
    def test_quantile_date_range(self):
        # GH 2460

        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
        ser = Series(dti)
        df = DataFrame(ser)

        result = df.quantile(numeric_only=False)
        expected = Series(
            ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
        )

        tm.assert_series_equal(result, expected)
Example #22
0
    def test_numeric_only_default_false_warning(self, non_num_col):
        """``quantile`` without ``numeric_only`` warns when column C is ``non_num_col``.

        A ``FutureWarning`` mentioning ``numeric_only`` is expected, and the
        result still covers only columns A and B.
        """
        # GH #7308
        frame = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]})
        frame["C"] = non_num_col

        with tm.assert_produces_warning(FutureWarning, match="numeric_only"):
            got = frame.quantile(0.5)

        want = Series([2.0, 3.0], index=["A", "B"], name=0.5)
        tm.assert_series_equal(got, want)
Example #23
0
    def test_quantile_multi(self):
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
        result = df.quantile([0.25, 0.5])
        expected = DataFrame(
            [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
            index=[0.25, 0.5],
            columns=["a", "b", "c"],
        )
        assert_frame_equal(result, expected)

        # axis = 1
        result = df.quantile([0.25, 0.5], axis=1)
        expected = DataFrame(
            [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2]
        )

        # empty
        result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0)
        expected = DataFrame(
            {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
        )
        assert_frame_equal(result, expected)
Example #24
0
    def test_quantile_datetime(self):
        df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]})

        # exclude datetime
        result = df.quantile(0.5)
        expected = Series([2.5], index=["b"])

        # datetime
        result = df.quantile(0.5, numeric_only=False)
        expected = Series(
            [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
        )
        assert_series_equal(result, expected)

        # datetime w/ multi
        result = df.quantile([0.5], numeric_only=False)
        expected = DataFrame(
            [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"]
        )
        assert_frame_equal(result, expected)

        # axis = 1
        df["c"] = pd.to_datetime(["2011", "2012"])
        result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
        expected = Series(
            [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
            index=[0, 1],
            name=0.5,
        )
        assert_series_equal(result, expected)

        result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
        expected = DataFrame(
            [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
            index=[0.5],
            columns=[0, 1],
        )
        assert_frame_equal(result, expected)
Example #25
0
    def test_quantile_axis_parameter(self):
        """Exercise the ``axis`` argument of ``DataFrame.quantile``.

        The integer and string spellings of each axis must give the same
        result, and an unknown axis must raise ``ValueError``.
        """
        # GH 9543/9544

        frame = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

        # axis=0 and its string spelling "index"
        by_index = frame.quantile(.5, axis=0)
        assert_series_equal(
            by_index, Series([2., 3.], index=["A", "B"], name=0.5))
        assert_series_equal(by_index, frame.quantile(.5, axis="index"))

        # axis=1 and its string spelling "columns"
        want = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(frame.quantile(.5, axis=1), want)
        assert_series_equal(frame.quantile(.5, axis="columns"), want)

        # invalid axes are rejected
        for bad_axis in (-1, "column"):
            pytest.raises(ValueError, frame.quantile, 0.1, axis=bad_axis)
Example #26
0
    def test_quantile(self):
        """A scalar quantile of a sparse frame matches the dense result."""
        # GH 17386
        rows = [[1, 1], [2, 10], [3, 100], [nan, nan]]
        level = 0.1

        from_sparse = SparseDataFrame(rows).quantile(level)

        from_dense = DataFrame(rows).quantile(level)
        as_sparse = SparseSeries(from_dense)

        tm.assert_series_equal(from_sparse, from_dense)
        tm.assert_sp_series_equal(from_sparse, as_sparse)
Example #27
0
    def test_quantile_multi(self):
        """List-like quantiles of a sparse frame match the dense result."""
        # GH 17386
        rows = [[1, 1], [2, 10], [3, 100], [nan, nan]]
        levels = [0.1, 0.5]

        from_sparse = SparseDataFrame(rows).quantile(levels)

        from_dense = DataFrame(rows).quantile(levels)
        as_sparse = SparseDataFrame(from_dense)

        tm.assert_frame_equal(from_sparse, from_dense)
        tm.assert_sp_frame_equal(from_sparse, as_sparse)
Example #28
0
    def test_quantile_axis_parameter(self):
        """Exercise the ``axis`` argument of ``DataFrame.quantile``.

        The integer and string spellings of each axis must give the same
        result, and an unknown axis must raise ``ValueError``.
        """
        # GH 9543/9544

        frame = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

        # axis=0 and its string spelling "index"
        by_index = frame.quantile(.5, axis=0)
        assert_series_equal(
            by_index, Series([2., 3.], index=["A", "B"], name=0.5))
        assert_series_equal(by_index, frame.quantile(.5, axis="index"))

        # axis=1 and its string spelling "columns"
        want = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(frame.quantile(.5, axis=1), want)
        assert_series_equal(frame.quantile(.5, axis="columns"), want)

        # invalid axes are rejected
        for bad_axis in (-1, "column"):
            pytest.raises(ValueError, frame.quantile, 0.1, axis=bad_axis)
Example #29
0
def _create_summary(data: pd.DataFrame, original):
    summary = pd.DataFrame(0,
                           index=data.columns,
                           columns=[
                               "original", "mean", "std.error", "perc.025",
                               "perc.975", "t stat."
                           ])
    summary.loc[:, "mean"] = data.mean(axis=0)
    summary.loc[:, "std.error"] = data.std(axis=0)
    summary.loc[:, "perc.025"] = data.quantile(0.025, axis=0)
    summary.loc[:, "perc.975"] = data.quantile(0.975, axis=0)
    summary.loc[:, "original"] = original
    summary.loc[:, "t stat."] = original / data.std(axis=0)
    return summary
Example #30
0
def test_quantile():
    """A scalar quantile of a sparse frame matches the dense result."""
    # GH 17386
    rows = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
    level = 0.1

    from_sparse = SparseDataFrame(rows).quantile(level)

    from_dense = DataFrame(rows).quantile(level)
    as_sparse = SparseSeries(from_dense)

    tm.assert_series_equal(from_sparse, from_dense)
    tm.assert_sp_series_equal(from_sparse, as_sparse)
Example #31
0
def test_quantile_multi():
    """List-like quantiles of a sparse frame match the dense result."""
    # GH 17386
    rows = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
    levels = [0.1, 0.5]

    from_sparse = SparseDataFrame(rows).quantile(levels)

    from_dense = DataFrame(rows).quantile(levels)
    as_sparse = SparseDataFrame(from_dense)

    tm.assert_frame_equal(from_sparse, from_dense)
    tm.assert_sp_frame_equal(from_sparse, as_sparse)
Example #32
0
    def test_quantile_multi(self):
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['a', 'b', 'c'])
        result = df.quantile([.25, .5])
        expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                             index=[.25, .5],
                             columns=['a', 'b', 'c'])
        assert_frame_equal(result, expected)

        # axis = 1
        result = df.quantile([.25, .5], axis=1)
        expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
                             index=[.25, .5],
                             columns=[0, 1, 2])

        # empty
        result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
        expected = DataFrame({
            'x': [np.nan, np.nan],
            'y': [np.nan, np.nan]
        },
                             index=[.1, .9])
        assert_frame_equal(result, expected)
 def testWordParser(self):
     '''
     Try finding quantiles of the number of word usages per event.

     For each source the proceedings titles are parsed, the usages are
     counted per event id, the 1.0 and 0.90 quantiles of those counts are
     printed and a histogram is saved. All word usages are then stored in
     the "wordusage" table. Nothing happens when no SQL database is
     available.

     see https://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
     '''
     sqlDB = Lookup("test Word parser").getSQLDB()
     if sqlDB is None:
         return

     allUsages = []
     for source in ['wikidata', 'crossref', 'dblp', 'CEUR-WS']:
         titles = TestWordParser.getProceedingsTitles(sqlDB, source)
         parser = CorpusWordParser()
         # number of word usages seen for each event id
         countByEvent = {}
         for usage in parser.parse(titles):
             allUsages.append(usage.__dict__)
             countByEvent[usage.eventId] = countByEvent.get(usage.eventId, 0) + 1
         counts = DataFrame(countByEvent.values())
         print(counts.quantile(1))
         print(counts.quantile(.90))
         histogram = Plot(countByEvent.values(),
                          "%s wordcount histogram" % source,
                          xlabel="wordcount",
                          ylabel="frequency")
         histogram.hist(mode='save')

     usageDB = SQLDB(Lookup.getDBFile("wordusage"))
     entityInfo = usageDB.createTable(allUsages, "wordusage", withDrop=True)
     usageDB.store(allUsages, entityInfo)
Example #34
0
def get_outliers_iqr(df: DataFrame, iqr_mul=3) -> "tuple[DataFrame, DataFrame]":
    """
    Flag the values of a pandas dataframe that are outliers by the IQR rule.

    A value is a lower outlier when it is below ``Q1 - iqr_mul * IQR`` of its
    column and an upper outlier when it is above ``Q3 + iqr_mul * IQR``.

    :param df: Pandas dataFrame
    :param iqr_mul: multiple of the IQR that sets the bounds; 1.5 flags
        normal and extreme outliers, 3 flags extreme outliers only
    :return: ``(lower_outliers, upper_outliers)``, two boolean masks produced
        by comparing `df` with the per-column bounds
    """
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    lower_outliers = df < (Q1 - iqr_mul * IQR)
    upper_outliers = df > (Q3 + iqr_mul * IQR)

    return lower_outliers, upper_outliers
Example #35
0
def bucketize(feature: pd.DataFrame, fc: tf.feature_column.numeric_column,
              n_bins: int):
    '''Bin pandas series in dataframe examples.

    The bucket boundaries are the ``n_bins + 1`` evenly spaced quantiles
    (levels 0 to 1 inclusive) of `feature`.

    Args:
      feature: pandas.Series
      fc: tensorflow.feature_column.numeric_column
      n_bins: int

    Returns:
      tensorflow.feature_column.bucketized_column
    '''

    # NOTE(review): `feature` is annotated pd.DataFrame but documented as a
    # pandas.Series; list(...) of a frame's quantiles would give its column
    # labels rather than values -- confirm callers pass a Series.
    qs = list(feature.quantile(np.linspace(0, 1, n_bins + 1)))
    return tf.feature_column.bucketized_column(fc, qs)
Example #36
0
def outliers(df: pd.DataFrame):
    """Print the 1.5 * IQR bounds of each column and how many values fall outside.

    The printed frame has one row per column of the quantile result, with
    the lower and upper boundary and the number of values below / above them.
    """
    first_quartile, third_quartile = df.quantile(0.25), df.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_limit = first_quartile - 1.5 * spread
    upper_limit = third_quartile + 1.5 * spread

    # restrict the comparison to the columns the quantiles were computed for
    considered = df[spread.index]
    below = (considered < lower_limit).sum()
    above = (considered > upper_limit).sum()

    report = pd.DataFrame({
        'lower_value': lower_limit,
        'upper_boundary': upper_limit,
        'num_of_outliers_L': below,
        'num_of_outliers_U': above
    })
    print(report)
Example #37
0
def get_evaluation_stats(results: pd.DataFrame) -> pd.Series:
  """ Computes model result statistics.

  Extends the output of ``describe()`` with the 95th and 99th percentiles.

  Args:
    results: Model evaluation results.

  Returns:
    Model statistics where each row includes a different
    statistic (the ``describe()`` rows plus '95%' and '99%').
  """
  stats = results.describe()
  # Use .loc so the percentiles are appended as ROWS. Plain ``stats['95%']``
  # on a DataFrame would add an all-NaN column instead, because the
  # per-column quantiles do not align with the statistic index.
  stats.loc['95%'] = results.quantile(.95)
  stats.loc['99%'] = results.quantile(.99)

  return stats
Example #38
0
    def test_quantile_axis_mixed(self):
        # Frame mixing int, float, datetime and string columns.
        frame = DataFrame({"A": [1, 2, 3],
                           "B": [2., 3., 4.],
                           "C": pd.date_range('20130101', periods=3),
                           "D": ['foo', 'bar', 'baz']})

        # Row-wise median with default arguments: the expected values are the
        # medians of the numeric columns A and B only.
        assert_series_equal(frame.quantile(.5, axis=1),
                            Series([1.5, 2.5, 3.5], name=0.5))

        # Asking for the non-numeric columns as well must raise.
        self.assertRaises(TypeError, frame.quantile,
                          .5, axis=1, numeric_only=False)
def remove_outliers(dataset: pd.DataFrame, strategy='Z', reindex=True, threshold=3) -> \
        pd.DataFrame:
    """
    A method that removes outliers from a dataset that contains no null values. Two strategies
    can be used for outliers' removal: z-score and IQR. In case the dataset contains less than
    12 values only IQR strategy can be used.

    Only numeric columns take part in the outlier detection; a row is dropped as soon as one
    of its numeric values is an outlier. Non-numeric columns are carried through untouched.

    :param dataset: A dataset to remove outliers from containing no null values.
    :param strategy: A strategy for removal (Z or IQR, case-insensitive).
    :param reindex: A new dataset will create new indexes if True.
    :param threshold: A threshold for a value to be considered outliers in case Z-score was chosen.
    :return: DataFrame containing no outliers.
    """

    # Too few observations for z-scores: fall back to IQR.
    # (.iloc: the count Series is labelled by column name, so look up by position.)
    if dataset.count().iloc[0] < 12:
        strategy = 'IQR'

    numeric = dataset.select_dtypes(include='number')

    if strategy.lower() == 'z':

        # Build the z-scores on the dataset's own index so that the boolean
        # mask below lines up with the rows even for a non-default index.
        z_scores = pd.DataFrame(
            {f'{col}_zscore': np.abs(stats.zscore(numeric[col].to_numpy(dtype=float)))
             for col in numeric.columns},
            index=dataset.index)

        keep = (z_scores < threshold).all(axis=1)

    else:

        first_quartile = numeric.quantile(0.25)
        third_quartile = numeric.quantile(0.75)
        iqr = third_quartile - first_quartile

        # Lower fence hangs off Q1, upper fence off Q3.
        outside = ((numeric < (first_quartile - 1.5 * iqr))
                   | (numeric > (third_quartile + 1.5 * iqr)))
        keep = ~outside.any(axis=1)

    # noinspection PyTypeChecker
    no_outliers_dataset = dataset[keep]

    if reindex:
        no_outliers_dataset = no_outliers_dataset.reset_index(drop=True)

    return no_outliers_dataset
def _get_quantiles(df: pd.DataFrame,
                   feats: List[str],
                   filter_debug: bool = True,
                   filter_continue: bool = True) -> Dict[str, List[float]]:
    filter_strings = []
    if filter_debug:
        filter_strings += ['(debug==0)']
    if filter_continue:
        filter_strings += ['(c==0)']
    if filter_strings:
        df = df.rename({
            "continue": "c"
        }, axis=1).query(' & '.join(filter_strings)).rename({"c": "continue"},
                                                            axis=1)
    df = df[feats].replace(0.0, pd.NA)
    df = df.quantile(np.arange(0, 1, .01))
    quantiles = df.to_dict('list')
    return quantiles
Example #41
0
def get_outliers(df: pd.DataFrame) -> (int, str):
    """
    Filter out the outliers in the "Fare" column using the IQR method.
    That is, compute the difference between the 3rd and 1st quartile, IQR = Q3 - Q1.
    Then filter out all values not satisfying: Q1 - 1.5*IQR < "Fare" < Q3 + 1.5*IQR.
    Draw a box plot of the "Fare" column before and after filtering the outliers.
    Return a tuple containing the number of outliers and the name of the passenger
    for the largest outlier.
    """
    # Take the quantiles of the "Fare" column only: computing them for the
    # whole frame is wasted work and fails on non-numeric columns.
    fare = df["Fare"]
    Q1 = fare.quantile(0.25)
    Q3 = fare.quantile(0.75)
    IQR = Q3 - Q1
    is_outlier = (fare < (Q1 - 1.5 * IQR)) | (fare > (Q3 + 1.5 * IQR))
    df_out = df[~is_outlier]
    df.boxplot(column="Fare")
    plt.show()
    df_out.boxplot(column="Fare")
    plt.show()
    # Position (not label) of the first row with the largest fare, so the
    # lookup is correct whatever index the frame carries.
    top_pos = fare.reset_index(drop=True).idxmax()
    return (len(df) - len(df_out), df["Name"].iloc[top_pos])
Example #42
0
def calculate_quantiles(
    data: pd.DataFrame
) -> Tuple[List[str], ...]:
    """
    Compute the minimum, quartiles and maximum of every column as strings.

    Args:
    -----
    - data: a pandas DataFrame

    Return:
    -------
    A tuple of five lists (0%, 25%, 50%, 75% and 100% quantiles), each
    holding one stringified value per column of ``data``.
    """
    probabilities = [.0, .25, .50, .75, 1.0]
    # One row of the quantile table per probability; unpacking pins the count to five.
    q0, q25, q50, q75, q100 = (
        list(map(str, row)) for row in data.quantile(probabilities).values
    )
    return (q0, q25, q50, q75, q100)
Example #43
0
    def fit(
        self: T_Self,
        X: pd.DataFrame,
        y: Optional[Union[pd.Series, pd.DataFrame]] = None,
        **fit_params,
    ) -> T_Self:
        """
        Fit the transformer.

        Derives, for every column of ``X``, a lower and an upper threshold
        placed ``iqr_multiple`` inter-quartile ranges below the first and
        above the third quartile, and records the column names of ``X``.
        ``y`` and ``fit_params`` are not used.

        :return: the fitted transformer
        """

        self: OutlierRemoverDF  # support type hinting in PyCharm

        lower_quartile: pd.Series = X.quantile(q=0.25)
        upper_quartile: pd.Series = X.quantile(q=0.75)
        margin: pd.Series = (upper_quartile - lower_quartile) * self.iqr_multiple

        self.threshold_low_ = lower_quartile - margin
        self.threshold_high_ = upper_quartile + margin
        self._features_original = X.columns.to_series()
        return self
Example #44
0
    def test_quantile_interpolation_np_lt_1p9(self):
        # GH #10174
        # Only meaningful on old numpy; skipped otherwise.
        if not _np_version_under1p9:
            raise nose.SkipTest("Numpy version is greater than 1.9")

        from numpy import percentile

        # interpolation = linear (default case)
        ts_quantile = self.tsframe.quantile(0.1, axis=0,
                                            interpolation='linear')
        self.assertEqual(ts_quantile['A'], percentile(self.tsframe['A'], 10))
        int_quantile = self.intframe.quantile(0.1)
        self.assertEqual(int_quantile['A'],
                         percentile(self.intframe['A'], 10))

        # test with and without interpolation keyword
        int_quantile_again = self.intframe.quantile(0.1)
        self.assertEqual(int_quantile_again['A'],
                         np.percentile(self.intframe['A'], 10))
        assert_series_equal(int_quantile, int_quantile_again)

        # Every interpolation method other than the default linear one must
        # be rejected: row-wise, on a degenerate (empty) frame, and with
        # several quantiles at once.
        err_pattern = "Interpolation methods other than linear"
        by_row = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        degenerate = DataFrame({'x': [], 'y': []})
        multi = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                          columns=['a', 'b', 'c'])
        rejected_calls = [
            (by_row, (.5,), dict(axis=1, interpolation='nearest')),
            (by_row, ([.5, .75],), dict(axis=1, interpolation='lower')),
            (degenerate, (0.1,), dict(axis=0, interpolation='higher')),
            (multi, ([.25, .5],), dict(interpolation='midpoint')),
        ]
        for frame, args, kwargs in rejected_calls:
            with assertRaisesRegexp(ValueError, err_pattern):
                frame.quantile(*args, **kwargs)
Example #45
0
    def test_quantile_box(self):
        # Checks that quantile(numeric_only=False) on datetime-like columns
        # returns boxed values: Timestamp for naive datetimes, tz-aware
        # Timestamp for tz-aware datetimes, and Timedelta for timedeltas.
        # Columns: A = naive datetimes, B = US/Eastern datetimes,
        # C = timedeltas.
        df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
                              pd.Timestamp('2011-01-02'),
                              pd.Timestamp('2011-01-03')],
                        'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                              pd.Timestamp('2011-01-02', tz='US/Eastern'),
                              pd.Timestamp('2011-01-03', tz='US/Eastern')],
                        'C': [pd.Timedelta('1 days'),
                              pd.Timedelta('2 days'),
                              pd.Timedelta('3 days')]})

        # scalar q -> Series named after q, one entry per column
        res = df.quantile(0.5, numeric_only=False)

        exp = pd.Series([pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timedelta('2 days')],
                        name=0.5, index=['A', 'B', 'C'])
        tm.assert_series_equal(res, exp)

        # list q -> DataFrame indexed by q
        res = df.quantile([0.5], numeric_only=False)
        exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
                             pd.Timestamp('2011-01-02', tz='US/Eastern'),
                             pd.Timedelta('2 days')]],
                           index=[0.5], columns=['A', 'B', 'C'])
        tm.assert_frame_equal(res, exp)

        # DatetimeBlock may be consolidated and contain NaT in different loc
        # Each dtype appears twice (upper/lower-case name) with its NaT at a
        # different row; the expected median is the same for both columns.
        df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
                              pd.NaT,
                              pd.Timestamp('2011-01-02'),
                              pd.Timestamp('2011-01-03')],
                        'a': [pd.Timestamp('2011-01-01'),
                              pd.Timestamp('2011-01-02'),
                              pd.NaT,
                              pd.Timestamp('2011-01-03')],
                        'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                              pd.NaT,
                              pd.Timestamp('2011-01-02', tz='US/Eastern'),
                              pd.Timestamp('2011-01-03', tz='US/Eastern')],
                        'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                              pd.Timestamp('2011-01-02', tz='US/Eastern'),
                              pd.NaT,
                              pd.Timestamp('2011-01-03', tz='US/Eastern')],
                        'C': [pd.Timedelta('1 days'),
                              pd.Timedelta('2 days'),
                              pd.Timedelta('3 days'),
                              pd.NaT],
                        'c': [pd.NaT,
                              pd.Timedelta('1 days'),
                              pd.Timedelta('2 days'),
                              pd.Timedelta('3 days')]},
                       columns=list('AaBbCc'))

        # scalar q with NaT present
        res = df.quantile(0.5, numeric_only=False)
        exp = pd.Series([pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timestamp('2011-01-02', tz='US/Eastern'),
                         pd.Timedelta('2 days'),
                         pd.Timedelta('2 days')],
                        name=0.5, index=list('AaBbCc'))
        tm.assert_series_equal(res, exp)

        # list q with NaT present
        res = df.quantile([0.5], numeric_only=False)
        exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
                             pd.Timestamp('2011-01-02'),
                             pd.Timestamp('2011-01-02', tz='US/Eastern'),
                             pd.Timestamp('2011-01-02', tz='US/Eastern'),
                             pd.Timedelta('2 days'),
                             pd.Timedelta('2 days')]],
                           index=[0.5], columns=list('AaBbCc'))
        tm.assert_frame_equal(res, exp)