Example #1
    def test_getitem_setitem_datetime_tz_pytz(self):
        tm._skip_if_no_pytz()
        from pytz import timezone as tz

        from pandas import date_range

        N = 50
        # testing with timezone, GH #2785
        rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
        ts = Series(np.random.randn(N), index=rng)

        # also test Timestamp tz handling, GH #2789
        result = ts.copy()
        result["1990-01-01 09:00:00+00:00"] = 0
        result["1990-01-01 09:00:00+00:00"] = ts[4]
        assert_series_equal(result, ts)

        result = ts.copy()
        result["1990-01-01 03:00:00-06:00"] = 0
        result["1990-01-01 03:00:00-06:00"] = ts[4]
        assert_series_equal(result, ts)

        # repeat with datetimes
        result = ts.copy()
        result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0
        result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4]
        assert_series_equal(result, ts)

        result = ts.copy()

        # comparison dates with datetime MUST be localized!
        date = tz('US/Central').localize(datetime(1990, 1, 1, 3))
        result[date] = 0
        result[date] = ts[4]
        assert_series_equal(result, ts)
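
A tz-aware key is converted to the index's zone before lookup, which is why the UTC and US/Central keys above both land on position 4. A minimal standalone sketch of that equivalence (values illustrative):

import pandas as pd

rng = pd.date_range('1/1/1990', periods=50, freq='H', tz='US/Eastern')
ts = pd.Series(range(50), index=rng)

# 09:00 UTC and 04:00 US/Eastern are the same instant, so the key
# resolves to position 4
utc_key = pd.Timestamp('1990-01-01 09:00:00', tz='UTC')
assert ts[utc_key] == ts.iloc[4]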
Example #2
    def test_set_axis_inplace(self):
        # GH14636

        s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')

        expected = s.copy()
        expected.index = list('abcd')

        for axis in 0, 'index':
            # inplace=True
            # The FutureWarning comes from the fact that we would like to have
            # inplace default to False some day
            for inplace, warn in (None, FutureWarning), (True, None):
                result = s.copy()
                kwargs = {'inplace': inplace}
                with tm.assert_produces_warning(warn):
                    result.set_axis(list('abcd'), axis=axis, **kwargs)
                tm.assert_series_equal(result, expected)

        # inplace=False
        result = s.set_axis(list('abcd'), axis=0, inplace=False)
        tm.assert_series_equal(expected, result)

        # omitting the "axis" parameter
        with tm.assert_produces_warning(None):
            result = s.set_axis(list('abcd'), inplace=False)
        tm.assert_series_equal(result, expected)

        # wrong values for the "axis" parameter
        for axis in 2, 'foo':
            with tm.assert_raises_regex(ValueError, 'No axis named'):
                s.set_axis(list('abcd'), axis=axis, inplace=False)
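
The FutureWarning asserted above concerned the inplace default; the argument was later removed altogether, and set_axis now always returns a new object. A hedged sketch of the modern call (assuming pandas >= 2.0):

import pandas as pd

s = pd.Series(range(4), index=[1, 3, 5, 7])
relabeled = s.set_axis(list('abcd'))   # returns a new Series
assert list(relabeled.index) == ['a', 'b', 'c', 'd']
assert list(s.index) == [1, 3, 5, 7]   # original is untouched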
Example #3
    def test_getitem_setitem_datetime_tz_dateutil(self):
        tm._skip_if_no_dateutil()
        from dateutil.tz import tzutc
        from pandas.tslib import _dateutil_gettz as gettz

        # dateutil needs a special case for UTC
        tz = lambda x: tzutc() if x == 'UTC' else gettz(x)

        from pandas import date_range
        N = 50
        # testing with timezone, GH #2785
        rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
        ts = Series(np.random.randn(N), index=rng)

        # also test Timestamp tz handling, GH #2789
        result = ts.copy()
        result["1990-01-01 09:00:00+00:00"] = 0
        result["1990-01-01 09:00:00+00:00"] = ts[4]
        assert_series_equal(result, ts)

        result = ts.copy()
        result["1990-01-01 03:00:00-06:00"] = 0
        result["1990-01-01 03:00:00-06:00"] = ts[4]
        assert_series_equal(result, ts)

        # repeat with datetimes
        result = ts.copy()
        result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0
        result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4]
        assert_series_equal(result, ts)

        result = ts.copy()
        result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = 0
        result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = ts[4]
        assert_series_equal(result, ts)
Example #4
def test_loc_getitem_setitem_integer_slice_keyerrors():
    s = Series(np.random.randn(10), index=lrange(0, 20, 2))

    # this is OK
    cp = s.copy()
    cp.iloc[4:10] = 0
    assert (cp.iloc[4:10] == 0).all()

    # so is this
    cp = s.copy()
    cp.iloc[3:11] = 0
    assert (cp.iloc[3:11] == 0).values.all()

    result = s.iloc[2:6]
    result2 = s.loc[3:11]
    expected = s.reindex([4, 6, 8, 10])

    assert_series_equal(result, expected)
    assert_series_equal(result2, expected)

    # non-monotonic, raise KeyError
    s2 = s.iloc[lrange(5) + lrange(5, 10)[::-1]]
    with pytest.raises(KeyError, match=r"^3L?$"):
        s2.loc[3:11]
    with pytest.raises(KeyError, match=r"^3L?$"):
        s2.loc[3:11] = 0
Example #5
def test_categorial_assigning_ops():
    orig = Series(Categorical(["b", "b"], categories=["a", "b"]))
    s = orig.copy()
    s[:] = "a"
    exp = Series(Categorical(["a", "a"], categories=["a", "b"]))
    tm.assert_series_equal(s, exp)

    s = orig.copy()
    s[1] = "a"
    exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
    tm.assert_series_equal(s, exp)

    s = orig.copy()
    s[s.index > 0] = "a"
    exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
    tm.assert_series_equal(s, exp)

    s = orig.copy()
    s[[False, True]] = "a"
    exp = Series(Categorical(["b", "a"], categories=["a", "b"]))
    tm.assert_series_equal(s, exp)

    s = orig.copy()
    s.index = ["x", "y"]
    s["y"] = "a"
    exp = Series(Categorical(["b", "a"], categories=["a", "b"]),
                 index=["x", "y"])
    tm.assert_series_equal(s, exp)

    # ensure that one can set something to np.nan
    s = Series(Categorical([1, 2, 3]))
    exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3]))
    s[1] = np.nan
    tm.assert_series_equal(s, exp)
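
Assignment only works within the declared categories; a value outside them raises rather than being added silently. A small sketch (the exception class has varied across pandas versions, hence the broad except):

import pandas as pd

s = pd.Series(pd.Categorical(['b', 'b'], categories=['a', 'b']))
try:
    s[0] = 'c'   # 'c' is not a declared category
except (TypeError, ValueError):
    # new categories must be added explicitly,
    # e.g. via s.cat.add_categories(['c'])
    pass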
Example #6
    def test_inplace_ops_identity(self):

        # GH 5104
        # make sure that we are actually changing the object
        s_orig = Series([1, 2, 3])
        df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

        # no dtype change
        s = s_orig.copy()
        s2 = s
        s += 1
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1, s)
        assert s is s2
        assert s._data is s2._data

        df = df_orig.copy()
        df2 = df
        df += 1
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1, df)
        assert df is df2
        assert df._data is df2._data

        # dtype change
        s = s_orig.copy()
        s2 = s
        s += 1.5
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1.5, s)

        df = df_orig.copy()
        df2 = df
        df += 1.5
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1.5, df)
        assert df is df2
        assert df._data is df2._data

        # mixed dtype
        arr = np.random.randint(0, 10, size=5)
        df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})
        df = df_orig.copy()
        df2 = df
        df['A'] += 1
        expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data

        df = df_orig.copy()
        df2 = df
        df['A'] += 1.5
        expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data
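
The identity checks above pin down that augmented assignment mutates the object in place rather than rebinding the name; in isolation:

import pandas as pd

s = pd.Series([1, 2, 3])
alias = s

s += 1        # in-place: alias sees the change, identity is preserved
assert s is alias

s = s + 1     # plain addition rebinds s to a brand-new Series
assert s is not alias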
Example #7
    def test_inplace_ops_identity(self):

        # GH 5104
        # make sure that we are actually changing the object
        s_orig = Series([1, 2, 3])
        df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

        # no dtype change
        s = s_orig.copy()
        s2 = s
        s += 1
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1, s)
        self.assertIs(s, s2)
        self.assertIs(s._data, s2._data)

        df = df_orig.copy()
        df2 = df
        df += 1
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1, df)
        self.assertIs(df, df2)
        self.assertIs(df._data, df2._data)

        # dtype change
        s = s_orig.copy()
        s2 = s
        s += 1.5
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1.5, s)

        df = df_orig.copy()
        df2 = df
        df += 1.5
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1.5, df)
        self.assertIs(df, df2)
        self.assertIs(df._data, df2._data)

        # mixed dtype
        arr = np.random.randint(0, 10, size=5)
        df_orig = DataFrame({"A": arr.copy(), "B": "foo"})
        df = df_orig.copy()
        df2 = df
        df["A"] += 1
        expected = DataFrame({"A": arr.copy() + 1, "B": "foo"})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        self.assertIs(df._data, df2._data)

        df = df_orig.copy()
        df2 = df
        df["A"] += 1.5
        expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        self.assertIs(df._data, df2._data)
Example #8
def test_mask_inplace():
    s = Series(np.random.randn(5))
    cond = s > 0

    rs = s.copy()
    rs.mask(cond, inplace=True)
    assert_series_equal(rs.dropna(), s[~cond])
    assert_series_equal(rs, s.mask(cond))

    rs = s.copy()
    rs.mask(cond, -s, inplace=True)
    assert_series_equal(rs, s.mask(cond, -s))
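
For reference, mask replaces entries where the condition holds, using NaN unless a replacement is supplied; a quick sketch:

import pandas as pd

s = pd.Series([-1.0, 2.0, -3.0])
print(s.mask(s > 0))       # positive entries become NaN
print(s.mask(s > 0, -s))   # positive entries are negated instead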
Example #9
    def test_iloc_setitem_pandas_object(self):
        # GH 17193, affecting old numpy (1.7 and 1.8)
        s_orig = Series([0, 1, 2, 3])
        expected = Series([0, -1, -2, 3])

        s = s_orig.copy()
        s.iloc[Series([1, 2])] = [-1, -2]
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.iloc[pd.Index([1, 2])] = [-1, -2]
        tm.assert_series_equal(s, expected)
Example #10
def test_setitem_ambiguous_keyerror():
    s = Series(lrange(10), index=lrange(0, 20, 2))

    # equivalent of an append
    s2 = s.copy()
    s2[1] = 5
    expected = s.append(Series([5], index=[1]))
    assert_series_equal(s2, expected)

    s2 = s.copy()
    s2.loc[1] = 5
    expected = s.append(Series([5], index=[1]))
    assert_series_equal(s2, expected)
Example #11
    def test_indexing_with_datetimeindex_tz(self):

        # GH 12050
        # indexing on a series with a datetimeindex with tz
        index = date_range('2015-01-01', periods=2, tz='utc')

        ser = Series(range(2), index=index, dtype='int64')

        # list-like indexing

        for sel in (index, list(index)):
            # getitem
            tm.assert_series_equal(ser[sel], ser)

            # setitem
            result = ser.copy()
            result[sel] = 1
            expected = Series(1, index=index)
            tm.assert_series_equal(result, expected)

            # .loc getitem
            tm.assert_series_equal(ser.loc[sel], ser)

            # .loc setitem
            result = ser.copy()
            result.loc[sel] = 1
            expected = Series(1, index=index)
            tm.assert_series_equal(result, expected)

        # single element indexing

        # getitem
        assert ser[index[1]] == 1

        # setitem
        result = ser.copy()
        result[index[1]] = 5
        expected = Series([0, 5], index=index)
        tm.assert_series_equal(result, expected)

        # .loc getitem
        assert ser.loc[index[1]] == 1

        # .loc setitem
        result = ser.copy()
        result.loc[index[1]] = 5
        expected = Series([0, 5], index=index)
        tm.assert_series_equal(result, expected)
Example #12
    def testSeriesNested(self):
        s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15])
        s.sort()

        nested = {'s1': s, 's2': s.copy()}

        exp = {'s1': ujson.decode(ujson.encode(s)),
               's2': ujson.decode(ujson.encode(s))}
        self.assertTrue(ujson.decode(ujson.encode(nested)) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="split")),
               's2': ujson.decode(ujson.encode(s, orient="split"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="records")),
               's2': ujson.decode(ujson.encode(s, orient="records"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="values")),
               's2': ujson.decode(ujson.encode(s, orient="values"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="index")),
               's2': ujson.decode(ujson.encode(s, orient="index"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp)
Example #13
def para_keys_modify(para):
    content = np.array(para)
    content = np.float64(content)
    content = Series(content)
    content1 = content.copy()
    # interleave the two halves of the 82-entry parameter vector:
    # entries 1..40 land on odd positions 3, 5, ..., 81,
    # entry 41 lands on position 1,
    # entries 42..81 land on even positions 2, 4, ..., 80
    for i in range(1, 41):
        content1[i * 2 + 1] = content[i]
    content1[1] = content[41]
    for i in range(42, 82):
        content1[(i - 41) * 2] = content[i]
    content1 = content1.values
    return content1


#def truepara_key_modify():
#    root_directory=r'E:\EnRML_Gas_Modelling\true_obs'
#    with open(r'E:\EnRML_Gas_Modelling\true_obs\para_true.txt','r') as f:
#        content=f.readlines()
#
#    content=np.array(content)
#    content=np.float64(content)
#    content=Series(content)
#    content1=content.copy()
#    for i in range(1,41):
#        content1[i*2+1]=content[i]
#    content1[1]=content[41]
#
#    for i in range(42,82):
#        content1[(i-41)*2]=content[i]
#
#    para_distribution_map(content1,1681,root_directory)
#    np.savetxt(r'E:\EnRML_Gas_Modelling\true_obs\para_true.txt',content1)
Example #14
    def test_constructor_with_datetimelike(self, dtl):
        # see gh-12077
        # constructor with a datetimelike and NaT

        s = Series(dtl)
        c = Categorical(s)

        expected = type(dtl)(s)
        expected.freq = None

        tm.assert_index_equal(c.categories, expected)
        tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))

        # with NaT
        s2 = s.copy()
        s2.iloc[-1] = NaT
        c = Categorical(s2)

        expected = type(dtl)(s2.dropna())
        expected.freq = None

        tm.assert_index_equal(c.categories, expected)

        exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(c.codes, exp)

        result = repr(c)
        assert "NaT" in result
Example #15
    def test_set_axis_inplace_axes(self, axis_series):
        # GH14636
        ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')

        expected = ser.copy()
        expected.index = list('abcd')

        # inplace=True
        # The FutureWarning comes from the fact that we would like to have
        # inplace default to False some day
        for inplace, warn in [(None, FutureWarning), (True, None)]:
            result = ser.copy()
            kwargs = {'inplace': inplace}
            with tm.assert_produces_warning(warn):
                result.set_axis(list('abcd'), axis=axis_series, **kwargs)
            tm.assert_series_equal(result, expected)
Example #16
    def test_constructor_with_datetimelike(self):

        # 12077
        # constructor with a datetimelike and NaT

        for dtl in [date_range('1995-01-01 00:00:00', periods=5, freq='s'),
                    date_range('1995-01-01 00:00:00', periods=5,
                               freq='s', tz='US/Eastern'),
                    timedelta_range('1 day', periods=5, freq='s')]:

            s = Series(dtl)
            c = Categorical(s)
            expected = type(dtl)(s)
            expected.freq = None
            tm.assert_index_equal(c.categories, expected)
            tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))

            # with NaT
            s2 = s.copy()
            s2.iloc[-1] = NaT
            c = Categorical(s2)
            expected = type(dtl)(s2.dropna())
            expected.freq = None
            tm.assert_index_equal(c.categories, expected)

            exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
            tm.assert_numpy_array_equal(c.codes, exp)

            result = repr(c)
            assert 'NaT' in result
Example #17
    def test_timedelta64_nan(self):

        from pandas import tslib

        td = Series([timedelta(days=i) for i in range(10)])

        # nan ops on timedeltas
        td1 = td.copy()
        td1[0] = np.nan
        self.assertTrue(isnull(td1[0]))
        self.assertEqual(td1[0].value, tslib.iNaT)
        td1[0] = td[0]
        self.assertFalse(isnull(td1[0]))

        td1[1] = tslib.iNaT
        self.assertTrue(isnull(td1[1]))
        self.assertEqual(td1[1].value, tslib.iNaT)
        td1[1] = td[1]
        self.assertFalse(isnull(td1[1]))

        td1[2] = tslib.NaT
        self.assertTrue(isnull(td1[2]))
        self.assertEqual(td1[2].value, tslib.iNaT)
        td1[2] = td[2]
        self.assertFalse(isnull(td1[2]))
Example #18
    def test_to_period(self):
        from pandas.tseries.period import period_range

        ts = _simple_ts('1/1/2000', '1/1/2001')

        pts = ts.to_period()
        exp = ts.copy()
        exp.index = period_range('1/1/2000', '1/1/2001')
        assert_series_equal(pts, exp)

        pts = ts.to_period('M')
        exp.index = exp.index.asfreq('M')
        tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
        assert_series_equal(pts, exp)

        # GH 7606 without freq
        idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                             '2011-01-04'])
        exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                                  '2011-01-04'], freq='D')

        s = Series(np.random.randn(4), index=idx)
        expected = s.copy()
        expected.index = exp_idx
        assert_series_equal(s.to_period(), expected)

        df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx)
        expected = df.copy()
        expected.index = exp_idx
        assert_frame_equal(df.to_period(), expected)

        expected = df.copy()
        expected.columns = exp_idx
        assert_frame_equal(df.to_period(axis=1), expected)
Example #19
    def test_fillna_consistency(self):
        # GH 16402
        # fillna with a tz aware to a tz-naive, should result in object

        s = Series([Timestamp('20130101'), pd.NaT])

        result = s.fillna(Timestamp('20130101', tz='US/Eastern'))
        expected = Series([Timestamp('20130101'),
                           Timestamp('2013-01-01', tz='US/Eastern')],
                          dtype='object')
        assert_series_equal(result, expected)

        # where (we ignore the errors=)
        result = s.where([True, False],
                         Timestamp('20130101', tz='US/Eastern'),
                         errors='ignore')
        assert_series_equal(result, expected)

        result = s.where([True, False],
                         Timestamp('20130101', tz='US/Eastern'),
                         errors='ignore')
        assert_series_equal(result, expected)

        # with a non-datetime
        result = s.fillna('foo')
        expected = Series([Timestamp('20130101'),
                           'foo'])
        assert_series_equal(result, expected)

        # assignment
        s2 = s.copy()
        s2[1] = 'foo'
        assert_series_equal(s2, expected)
Example #20
    def test_copy(self):

        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype='float64')

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

        # GH 11794
        # copy of tz-aware
        expected = Series([Timestamp('2012/01/01', tz='UTC')])
        expected2 = Series([Timestamp('1999/01/01', tz='UTC')])

        for deep in [None, False, True]:

            s = Series([Timestamp('2012/01/01', tz='UTC')])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp('1999/01/01', tz='UTC')

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected2)
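
The deep/shallow distinction the loop exercises, in isolation (under pandas 2.x copy-on-write a shallow copy no longer shares writes, so this sketch assumes the legacy semantics the test was written against):

import numpy as np
import pandas as pd

s = pd.Series(np.arange(3), dtype='float64')

deep = s.copy()             # deep=True is the default
deep[0] = np.nan
assert not np.isnan(s[0])   # original unaffected

shallow = s.copy(deep=False)
shallow[1] = np.nan         # legacy behavior: the original sees this write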
Example #21
    def test_fillna_inplace(self):
        x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"])
        y = x.copy()

        y.fillna(value=0, inplace=True)

        expected = x.fillna(value=0)
        assert_series_equal(y, expected)
Example #22
def test_drop_duplicates_bool(keep, expected):
    tc = Series([True, False, True, False])

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #23
def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected):
    tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype))

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
Example #24
    def test_fillna_inplace(self):
        x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd'])
        y = x.copy()

        y.fillna(value=0, inplace=True)

        expected = x.fillna(value=0)
        assert_series_equal(y, expected)
Example #25
    def test_rank_modify_inplace(self):
        # GH 18521
        # Check rank does not mutate series
        s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT])
        expected = s.copy()

        s.rank()
        result = s
        assert_series_equal(result, expected)
Example #26
def test_setitem_float_labels():
    # note labels are floats
    s = Series(['a', 'b', 'c'], index=[0, 0.5, 1])
    tmp = s.copy()

    s.loc[1] = 'zoo'
    tmp.iloc[2] = 'zoo'

    assert_series_equal(s, tmp)
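
With float labels, .loc matches labels while positional access counts elements, which is why s.loc[1] pairs with tmp.iloc[2] above. Compactly:

import pandas as pd

s = pd.Series(['a', 'b', 'c'], index=[0, 0.5, 1])
assert s.loc[1] == 'c'    # label 1 is the third entry
assert s.iloc[1] == 'b'   # position 1 is the second entry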
Example #27
    def test_series_nested(self, orient):
        s = Series([10, 20, 30, 40, 50, 60], name="series",
                   index=[6, 7, 8, 9, 10, 15]).sort_values()
        nested = {"s1": s, "s2": s.copy()}
        kwargs = {} if orient is None else dict(orient=orient)

        exp = {"s1": ujson.decode(ujson.encode(s, **kwargs)),
               "s2": ujson.decode(ujson.encode(s, **kwargs))}
        assert ujson.decode(ujson.encode(nested, **kwargs)) == exp
Example #28
    def test_operators_datetimelike_invalid(self, all_arithmetic_operators):
        # these are all TypeError ops
        op_str = all_arithmetic_operators

        def check(get_ser, test_ser):

            # check that we are getting a TypeError
            # with 'operate' (from core/ops.py) for the ops that are not
            # defined
            op = getattr(get_ser, op_str, None)
            with tm.assert_raises_regex(TypeError, 'operate|cannot'):
                op(test_ser)

        # ## timedelta64 ###
        td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
        td1.iloc[2] = np.nan

        # ## datetime64 ###
        dt1 = Series([Timestamp('20111230'), Timestamp('20120101'),
                      Timestamp('20120103')])
        dt1.iloc[2] = np.nan
        dt2 = Series([Timestamp('20111231'), Timestamp('20120102'),
                      Timestamp('20120104')])
        if op_str not in ['__sub__', '__rsub__']:
            check(dt1, dt2)

        # ## datetime64 with timetimedelta ###
        # TODO(jreback) __rsub__ should raise?
        if op_str not in ['__add__', '__radd__', '__sub__']:
            check(dt1, td1)

        # 8260, 10763
        # datetime64 with tz
        tz = 'US/Eastern'
        dt1 = Series(date_range('2000-01-01 09:00:00', periods=5,
                                tz=tz), name='foo')
        dt2 = dt1.copy()
        dt2.iloc[2] = np.nan
        td1 = Series(timedelta_range('1 days 1 min', periods=5, freq='H'))
        td2 = td1.copy()
        td2.iloc[1] = np.nan

        if op_str not in ['__add__', '__radd__', '__sub__', '__rsub__']:
            check(dt2, td2)
Example #29
def test_basic_setitem_with_labels(test_data):
    indices = test_data.ts.index[[5, 10, 15]]

    cp = test_data.ts.copy()
    exp = test_data.ts.copy()
    cp[indices] = 0
    exp.loc[indices] = 0
    assert_series_equal(cp, exp)

    cp = test_data.ts.copy()
    exp = test_data.ts.copy()
    cp[indices[0]:indices[2]] = 0
    exp.loc[indices[0]:indices[2]] = 0
    assert_series_equal(cp, exp)

    # integer indexes, be careful
    s = Series(np.random.randn(10), index=lrange(0, 20, 2))
    inds = [0, 4, 6]
    arr_inds = np.array([0, 4, 6])

    cp = s.copy()
    exp = s.copy()
    s[inds] = 0
    s.loc[inds] = 0
    assert_series_equal(cp, exp)

    cp = s.copy()
    exp = s.copy()
    s[arr_inds] = 0
    s.loc[arr_inds] = 0
    assert_series_equal(cp, exp)

    inds_notfound = [0, 4, 5, 6]
    arr_inds_notfound = np.array([0, 4, 5, 6])
    msg = r"\[5\] not contained in the index"
    with pytest.raises(ValueError, match=msg):
        s[inds_notfound] = 0
    with pytest.raises(Exception, match=msg):
        s[arr_inds_notfound] = 0

    # GH12089
    # with tz for values
    s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"),
               index=['a', 'b', 'c'])
    s2 = s.copy()
    expected = Timestamp('2011-01-03', tz='US/Eastern')
    s2.loc['a'] = expected
    result = s2.loc['a']
    assert result == expected

    s2 = s.copy()
    s2.iloc[0] = expected
    result = s2.iloc[0]
    assert result == expected

    s2 = s.copy()
    s2['a'] = expected
    result = s2['a']
    assert result == expected
Example #30
    def test_getitem_setitem_periodindex(self):
        from pandas import period_range

        N = 50
        rng = period_range('1/1/1990', periods=N, freq='H')
        ts = Series(np.random.randn(N), index=rng)

        result = ts["1990-01-01 04"]
        expected = ts[4]
        self.assertEqual(result, expected)

        result = ts.copy()
        result["1990-01-01 04"] = 0
        result["1990-01-01 04"] = ts[4]
        assert_series_equal(result, ts)

        result = ts["1990-01-01 04":"1990-01-01 07"]
        expected = ts[4:8]
        assert_series_equal(result, expected)

        result = ts.copy()
        result["1990-01-01 04":"1990-01-01 07"] = 0
        result["1990-01-01 04":"1990-01-01 07"] = ts[4:8]
        assert_series_equal(result, ts)

        lb = "1990-01-01 04"
        rb = "1990-01-01 07"
        result = ts[(ts.index >= lb) & (ts.index <= rb)]
        expected = ts[4:8]
        assert_series_equal(result, expected)

        # GH 2782
        result = ts[ts.index[4]]
        expected = ts[4]
        self.assertEqual(result, expected)

        result = ts[ts.index[4:8]]
        expected = ts[4:8]
        assert_series_equal(result, expected)

        result = ts.copy()
        result[ts.index[4:8]] = 0
        result[4:8] = ts[4:8]
        assert_series_equal(result, ts)
Example #31
    def test_slice_integer(self):

        # same as above, but for integer-based indexes
        # these coerce to a like integer
        # oob indicates whether we are out of bounds
        # of positional indexing
        for index, oob in [(tm.makeIntIndex(5), False),
                           (tm.makeRangeIndex(5), False),
                           (tm.makeIntIndex(5) + 10, True)]:

            # s is an in-range index
            s = Series(range(5), index=index)

            # getitem
            for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:

                    with catch_warnings(record=True):
                        result = idxr(s)[l]

                    # these are all label indexing
                    # except getitem which is positional
                    # empty
                    if oob:
                        indexer = slice(0, 0)
                    else:
                        indexer = slice(3, 5)
                    self.check(result, s, indexer, False)

                # positional indexing
                def f():
                    s[l]

                pytest.raises(TypeError, f)

            # getitem out-of-bounds
            for l in [slice(-6, 6), slice(-6.0, 6.0)]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:
                    with catch_warnings(record=True):
                        result = idxr(s)[l]

                    # these are all label indexing
                    # except getitem which is positional
                    # empty
                    if oob:
                        indexer = slice(0, 0)
                    else:
                        indexer = slice(-6, 6)
                    self.check(result, s, indexer, False)

            # positional indexing
            def f():
                s[slice(-6.0, 6.0)]

            pytest.raises(TypeError, f)

            # getitem odd floats
            for l, res1 in [(slice(2.5, 4), slice(3, 5)),
                            (slice(2, 3.5), slice(2, 4)),
                            (slice(2.5, 3.5), slice(3, 4))]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:

                    with catch_warnings(record=True):
                        result = idxr(s)[l]
                    if oob:
                        res = slice(0, 0)
                    else:
                        res = res1

                    self.check(result, s, res, False)

                # positional indexing
                def f():
                    s[l]

                pytest.raises(TypeError, f)

            # setitem
            for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:
                    sc = s.copy()
                    with catch_warnings(record=True):
                        idxr(sc)[l] = 0
                        result = idxr(sc)[l].values.ravel()
                    assert (result == 0).all()

                # positional indexing
                def f():
                    s[l] = 0

                pytest.raises(TypeError, f)
Example #32
    def _predict_core(self, s: pd.Series) -> pd.Series:
        if not (s.index.is_monotonic_increasing
                or s.index.is_monotonic_decreasing):
            raise ValueError("Time series must have a monotonic time index. ")

        agg = self.agg
        agg_params = self.agg_params if (self.agg_params is not None) else {}
        window = self.window
        min_periods = self.min_periods
        center = self.center
        diff = self.diff

        if not isinstance(agg, tuple):
            agg = (agg, agg)

        if not isinstance(agg_params, tuple):
            agg_params = (agg_params, agg_params)

        if not isinstance(window, tuple):
            window = (window, window)

        if not isinstance(min_periods, tuple):
            min_periods = (min_periods, min_periods)

        if center:
            if isinstance(window[0], int):
                s_rolling_left = RollingAggregate(
                    agg=agg[0],
                    agg_params=agg_params[0],
                    window=window[0],
                    min_periods=min_periods[0],
                    center=False,
                ).transform(s.shift(1))
            else:
                ra = RollingAggregate(
                    agg=agg[0],
                    agg_params=agg_params[0],
                    window=window[0],
                    min_periods=min_periods[0],
                    center=False,
                )
                if parse(pd.__version__) < parse("0.25"):
                    raise PandasBugError()
                ra._closed = "left"
                s_rolling_left = ra.transform(s)
            if isinstance(window[1], int):
                s_rolling_right = (RollingAggregate(
                    agg=agg[1],
                    agg_params=agg_params[1],
                    window=window[1],
                    min_periods=min_periods[1],
                    center=False,
                ).transform(s.iloc[::-1]).iloc[::-1])
            else:
                s_reversed = pd.Series(
                    s.values[::-1],
                    index=pd.DatetimeIndex([
                        s.index[0] + (s.index[-1] - s.index[i])
                        for i in range(len(s) - 1, -1, -1)
                    ]),
                )
                s_rolling_right = pd.Series(
                    RollingAggregate(
                        agg=agg[1],
                        agg_params=agg_params[1],
                        window=window[1],
                        min_periods=min_periods[1],
                        center=False,
                    ).transform(s_reversed).iloc[::-1].values,
                    index=s.index,
                )
                s_rolling_right.name = s.name
        else:
            if isinstance(window[1], int):
                s_rolling_left = RollingAggregate(
                    agg=agg[0],
                    agg_params=agg_params[0],
                    window=window[0],
                    min_periods=min_periods[0],
                    center=False,
                ).transform(s.shift(window[1]))
            else:
                s_shifted = pd.Series(s.values,
                                      s.index + pd.Timedelta(window[1]))
                s_shifted = s_shifted.append(
                    pd.Series(index=s.index, dtype="float64"))
                s_shifted = s_shifted.iloc[~s_shifted.index.duplicated()]
                s_shifted = s_shifted.sort_index()
                s_shifted.name = s.name
                s_rolling_left = RollingAggregate(
                    agg=agg[0],
                    agg_params=agg_params[0],
                    window=window[0],
                    min_periods=min_periods[0],
                    center=False,
                ).transform(s_shifted)
                if isinstance(s_rolling_left, pd.Series):
                    s_rolling_left = s_rolling_left[s.index]
                else:
                    s_rolling_left = s_rolling_left.loc[s.index, :]
            s_rolling_right = RollingAggregate(
                agg=agg[1],
                agg_params=agg_params[1],
                window=window[1],
                min_periods=min_periods[1],
                center=False,
            ).transform(s)

        if isinstance(s_rolling_left, pd.Series):
            if diff in ["l1", "l2"]:
                return abs(s_rolling_right - s_rolling_left)
            if diff == "diff":
                return s_rolling_right - s_rolling_left
            if diff == "rel_diff":
                return (s_rolling_right - s_rolling_left) / s_rolling_left
            if diff == "abs_rel_diff":
                return abs(s_rolling_right - s_rolling_left) / s_rolling_left

        if isinstance(s_rolling_left, pd.DataFrame):
            if diff == "l1":
                return abs(s_rolling_right - s_rolling_left).sum(axis=1,
                                                                 skipna=False)
            if diff == "l2":
                return ((s_rolling_right - s_rolling_left)**2).sum(
                    axis=1, skipna=False)**0.5

        if callable(diff):
            s_rolling = s.copy()
            for i in range(len(s_rolling)):
                s_rolling.iloc[i] = diff(s_rolling_left.iloc[i],
                                         s_rolling_right.iloc[i])
            return s_rolling

        raise ValueError("Invalid value of diff")
Example #33
def pattern_match(  # pylint: disable=too-many-arguments
    meta_col: pd.Series,
    values: Union[Iterable[str], str],
    level: Optional[Union[str, int]] = None,
    regexp: bool = False,
    has_nan: bool = True,
    separator: str = DEFAULT_SEPARATOR,
) -> np.ndarray:
    """
    Filter data by matching metadata columns to given patterns

    Parameters
    ----------
    meta_col
        Column to perform filtering on

    values
        Values to match

    level
        Passed to ``find_depth``. For usage, see docstring of ``find_depth``.

    regexp
        If True, match using regexp rather than pseudo regexp syntax developed by the
        `pyam <https://github.com/IAMconsortium/pyam>`_ developers.

    has_nan
        If True, convert all nan in ``meta_col`` to empty string before applying
        filters. This means that "" and "*" will match rows with ``np.nan``. If False,
        the conversion is not applied and so a search in a string column which
        contains ``np.nan`` will result in a ``TypeError``.

    separator
        String used to separate the hierarchy levels in values. Defaults to '|'

    Returns
    -------
    :obj:`np.array` of :obj:`bool`
        Array where True indicates a match

    Raises
    ------
    TypeError
        Filtering is performed on a string metadata column which contains
        ``np.nan`` and ``has_nan`` is ``False``.
    """
    matches = np.array([False] * len(meta_col))
    _values = [values] if not isinstance(values, Iterable) or is_str(values) else values

    # pyam issue (#40) with string-to-nan comparison, replace nan by empty string
    # TODO: add docs and example of filtering/removing NaN given this internal
    #       conversion
    _meta_col = meta_col.copy()
    if has_nan:
        _meta_col.loc[[np.isnan(i) if not is_str(i) else False for i in _meta_col]] = ""

    for s in _values:
        if is_str(s):
            _regexp = (
                str(s)
                .replace("|", "\\|")
                .replace(".", r"\.")  # `.` has to be replaced before `*`
                .replace("*", ".*")
                .replace("+", r"\+")
                .replace("(", r"\(")
                .replace(")", r"\)")
                .replace("$", "\\$")
            ) + "$"
            pattern = re.compile(_regexp if not regexp else str(s))
            try:
                subset = [m for m in _meta_col if pattern.match(m)]
            except TypeError as e:
                # if it's not the cryptic pandas message we expect, raise
                msg = str(e)
                if msg != "expected string or bytes-like object":
                    raise e  # pragma: no cover # emergency valve

                error_msg = (
                    "String filtering cannot be performed on column '{}', which "
                    "contains NaN's, unless `has_nan` is True".format(_meta_col.name)
                )
                raise TypeError(error_msg)

            depth = (
                True
                if level is None
                else find_depth(_meta_col, str(s), level, separator=separator)
            )
            matches |= _meta_col.isin(subset) & depth
        else:
            matches |= meta_col == s

    return matches
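
The escaping chain above turns pyam-style pseudo patterns into anchored regular expressions, with * as the only wildcard. A hypothetical helper showing a trimmed version of the same translation ('|' and '.' become literal, '*' becomes '.*', and the pattern is anchored):

import re

def glob_to_regex(pattern: str):
    escaped = (pattern.replace('|', '\\|')
                      .replace('.', r'\.')
                      .replace('*', '.*')) + '$'
    return re.compile(escaped)

assert glob_to_regex('World|*').match('World|Primary Energy')
assert not glob_to_regex('World|*').match('Other|Primary Energy')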
Example #34
    def test_partial_setting(self):

        # GH2578, allow ix and friends to partially set

        # series
        s_orig = Series([1, 2, 3])

        s = s_orig.copy()
        s[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s[5] = 5.0
        expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5.0
        expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        # iloc/iat raise
        s = s_orig.copy()

        with pytest.raises(IndexError):
            s.iloc[3] = 5.0

        with pytest.raises(IndexError):
            s.iat[3] = 5.0

        # ## frame ##

        df_orig = DataFrame(np.arange(6).reshape(3, 2),
                            columns=["A", "B"],
                            dtype="int64")

        # iloc/iat raise
        df = df_orig.copy()

        with pytest.raises(IndexError):
            df.iloc[4, 2] = 5.0

        with pytest.raises(IndexError):
            df.iat[4, 2] = 5.0

        # row setting where it exists
        expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]}))
        df = df_orig.copy()
        df.iloc[1] = df.iloc[2]
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]}))
        df = df_orig.copy()
        df.loc[1] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # like 2578, partial setting with dtype preservation
        expected = DataFrame(dict({"A": [0, 2, 4, 4], "B": [1, 3, 5, 5]}))
        df = df_orig.copy()
        df.loc[3] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # single dtype frame, overwrite
        expected = DataFrame(dict({"A": [0, 2, 4], "B": [0, 2, 4]}))
        df = df_orig.copy()
        df.loc[:, "B"] = df.loc[:, "A"]
        tm.assert_frame_equal(df, expected)

        # mixed dtype frame, overwrite
        expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])}))
        df = df_orig.copy()
        df["B"] = df["B"].astype(np.float64)
        df.loc[:, "B"] = df.loc[:, "A"]
        tm.assert_frame_equal(df, expected)

        # single dtype frame, partial setting
        expected = df_orig.copy()
        expected["C"] = df["A"]
        df = df_orig.copy()
        df.loc[:, "C"] = df.loc[:, "A"]
        tm.assert_frame_equal(df, expected)

        # mixed frame, partial setting
        expected = df_orig.copy()
        expected["C"] = df["A"]
        df = df_orig.copy()
        df.loc[:, "C"] = df.loc[:, "A"]
        tm.assert_frame_equal(df, expected)

        # GH 8473
        dates = date_range("1/1/2000", periods=8)
        df_orig = DataFrame(np.random.randn(8, 4),
                            index=dates,
                            columns=["A", "B", "C", "D"])

        expected = pd.concat(
            [df_orig,
             DataFrame({"A": 7}, index=[dates[-1] + dates.freq])],
            sort=True)
        df = df_orig.copy()
        df.loc[dates[-1] + dates.freq, "A"] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + dates.freq, "A"] = 7
        tm.assert_frame_equal(df, expected)

        exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq])
        expected = pd.concat([df_orig, exp_other], axis=1)

        df = df_orig.copy()
        df.loc[dates[-1] + dates.freq, 0] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + dates.freq, 0] = 7
        tm.assert_frame_equal(df, expected)
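
The rule the test enforces: label-based setters may enlarge the object, positional ones may not. In isolation:

import pandas as pd

s = pd.Series([1, 2, 3])
s.loc[5] = 5        # enlargement: a new row labeled 5 appears
try:
    s.iloc[3] = 5   # positional setters never enlarge
except IndexError:
    pass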
Example #35
    def test_convert_objects(self):

        s = Series([1., 2, 3], index=['a', 'b', 'c'])
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates=False,
                                       convert_numeric=True)
        assert_series_equal(result, s)

        # force numeric conversion
        r = s.copy().astype('O')
        r['a'] = '1'
        with tm.assert_produces_warning(FutureWarning):
            result = r.convert_objects(convert_dates=False,
                                       convert_numeric=True)
        assert_series_equal(result, s)

        r = s.copy().astype('O')
        r['a'] = '1.'
        with tm.assert_produces_warning(FutureWarning):
            result = r.convert_objects(convert_dates=False,
                                       convert_numeric=True)
        assert_series_equal(result, s)

        r = s.copy().astype('O')
        r['a'] = 'garbled'
        expected = s.copy()
        expected['a'] = np.nan
        with tm.assert_produces_warning(FutureWarning):
            result = r.convert_objects(convert_dates=False,
                                       convert_numeric=True)
        assert_series_equal(result, expected)

        # GH 4119, not converting a mixed type (e.g. floats and object)
        s = Series([1, 'na', 3, 4])
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_numeric=True)
        expected = Series([1, np.nan, 3, 4])
        assert_series_equal(result, expected)

        s = Series([1, '', 3, 4])
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_numeric=True)
        expected = Series([1, np.nan, 3, 4])
        assert_series_equal(result, expected)

        # dates
        s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                    datetime(2001, 1, 3, 0, 0)])
        s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                     datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1,
                     Timestamp('20010104'), '20010105'],
                    dtype='O')
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates=True,
                                       convert_numeric=False)
        expected = Series([Timestamp('20010101'), Timestamp('20010102'),
                           Timestamp('20010103')], dtype='M8[ns]')
        assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates='coerce',
                                       convert_numeric=False)
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates='coerce',
                                       convert_numeric=True)
        assert_series_equal(result, expected)

        expected = Series([Timestamp('20010101'), Timestamp('20010102'),
                           Timestamp('20010103'),
                           lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'),
                           Timestamp('20010105')], dtype='M8[ns]')
        with tm.assert_produces_warning(FutureWarning):
            result = s2.convert_objects(convert_dates='coerce',
                                        convert_numeric=False)
        assert_series_equal(result, expected)
        with tm.assert_produces_warning(FutureWarning):
            result = s2.convert_objects(convert_dates='coerce',
                                        convert_numeric=True)
        assert_series_equal(result, expected)

        # preserve all-NaNs (if convert_dates='coerce')
        s = Series(['foo', 'bar', 1, 1.0], dtype='O')
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates='coerce',
                                       convert_numeric=False)
        expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
        assert_series_equal(result, expected)

        # preserve if non-object
        s = Series([1], dtype='float32')
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates='coerce',
                                       convert_numeric=False)
        assert_series_equal(result, s)

        # r = s.copy()
        # r[0] = np.nan
        # result = r.convert_objects(convert_dates=True,convert_numeric=False)
        # assert result.dtype == 'M8[ns]'

        # dateutil parses some single letters into today's value as a date
        for x in 'abcdefghijklmnopqrstuvwxyz':
            s = Series([x])
            with tm.assert_produces_warning(FutureWarning):
                result = s.convert_objects(convert_dates='coerce')
            assert_series_equal(result, s)
            s = Series([x.upper()])
            with tm.assert_produces_warning(FutureWarning):
                result = s.convert_objects(convert_dates='coerce')
            assert_series_equal(result, s)
Example #36
def func(X: pd.DataFrame,
         y: pd.Series,
         selection_times=3,
         title="RON_loss",
         del_abnormal=False,
         abnormal_threshold=0.08):
    y = np.array(y)
    selector = SelectFromModel(estimator=GradientBoostingRegressor(
        random_state=0))
    X_ = QuantileTransformer(n_quantiles=1000).fit_transform(X)
    X_ = pd.DataFrame(X_, columns=X.columns)
    for i in range(selection_times):
        X_d = selector.fit_transform(X_, y)
        X_ = pd.DataFrame(X_d, columns=X_.columns[selector.get_support()])
    X_d = QuantileTransformer(n_quantiles=1000).fit_transform(X[X_.columns])
    # X_d=X[X_.columns].values
    X_ = pd.DataFrame(X_d, columns=X_.columns)
    print(f"{title} | {selection_times}次特征筛选后的X_.shape = {X_.shape}")
    print(f"{title} | 特征筛选后保留的列: {X_.columns.tolist()}")
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    pipeline = LGBMRegressor(random_state=0,
                             n_estimators=100,
                             learning_rate=0.1)
    pipeline.fit(X_, y)
    y_pred = pipeline.predict(X_)
    train_score = r2_score(y, y_pred)
    pearson_correlation = pearsonr(y, y_pred)[0]
    print(
        f"{title} | 在训练集上,r2 = {train_score}, pearson 相关系数 = {pearson_correlation}"
    )
    y_ = y.copy()
    y_pred_ = y_pred.copy()
    if del_abnormal:
        y_pred = pipeline.predict(X_)
        err = np.abs(y - y_pred)
        mask = err > abnormal_threshold
        print(f"{title} | 异常样本数 = {np.count_nonzero(mask)}")
        plt.rcParams['figure.figsize'] = (7, 4.5)
        plt.grid(alpha=0.2)
        plt.scatter(y[mask], y_pred[mask], label="abnormal samples", c="r")
        plt.scatter(y[~mask], y_pred[~mask], label="normal samples", c="b")
        plt.legend(loc="best")
        print(
            f"{title} | 删除异常样本前的表现 = {cross_val_score(pipeline, X_, y, cv=cv).mean()}"
        )
        X_ = X_.loc[~mask, :]
        y = y[~mask]
        print(
            f"{title} | 删除异常样本后的表现 = {cross_val_score(pipeline, X_, y, cv=cv).mean()}"
        )
        plt.title(f"{title} abnormal samples")
        plt.xlabel("y true")
        plt.ylabel("y pred")
        plt.savefig(f"{title}_abnormal.pdf")
        plt.close()
    valid_scores = []
    plt.rcParams['figure.figsize'] = (18, 12)
    for i, (train_ix, valid_ix) in enumerate(cv.split(X_, y)):
        X_train = X_.iloc[train_ix, :].copy()
        X_valid = X_.iloc[valid_ix, :].copy()
        y_train = y[train_ix].copy()
        y_valid = y[valid_ix].copy()
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_valid)
        plt.subplot(2, 3, i + 1)
        sns.regplot(x="y true",
                    y="y pred",
                    data=pd.DataFrame({
                        "y true": y_valid,
                        "y pred": y_pred
                    }))
        plt.title(f"fold-{i + 1}")
        valid_scores.append(r2_score(y_valid, y_pred))
    plt.subplot(2, 3, 6)
    sns.regplot(x="y true",
                y="y pred",
                data=pd.DataFrame({
                    "y true": y_,
                    "y pred": y_pred_
                }))
    plt.title(f"train-set")
    plt.suptitle(f"{title} cross-validation")
    print(f"{title} | 5折交叉验证后,在验证集上的平均r2 = {np.mean(valid_scores)}\n"
          f"{title} | 每折的r2 = {valid_scores}")
    plt.savefig(f"{title}_cross-validation.pdf")
    plt.close()
    X_["label"] = y
    X_.to_csv(f"{title}_data.csv", index=False)
    dump(pipeline, f"{title}_model.bz2")
Example #37
    def _subtract_min_from_every_element(series: pd.Series):
        min_value = series.copy().apply(func=lambda x: x.value).min()
        return series.apply(
            func=PandasMunkres._subtract_value_from_object_value,
            args=(min_value, ))
Example #38
    def test_partial_setting(self):

        # GH2578, allow ix and friends to partially set

        # series
        s_orig = Series([1, 2, 3])

        s = s_orig.copy()
        s[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        # iloc/iat raise
        s = s_orig.copy()

        def f():
            s.iloc[3] = 5.

        self.assertRaises(IndexError, f)

        def f():
            s.iat[3] = 5.

        self.assertRaises(IndexError, f)

        # ## frame ##

        df_orig = DataFrame(np.arange(6).reshape(3, 2),
                            columns=['A', 'B'],
                            dtype='int64')

        # iloc/iat raise
        df = df_orig.copy()

        def f():
            df.iloc[4, 2] = 5.

        self.assertRaises(IndexError, f)

        def f():
            df.iat[4, 2] = 5.

        self.assertRaises(IndexError, f)

        # row setting where it exists
        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.iloc[1] = df.iloc[2]
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.loc[1] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # like 2578, partial setting with dtype preservation
        expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
        df = df_orig.copy()
        df.loc[3] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # single dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
        df = df_orig.copy()
        df['B'] = df['B'].astype(np.float64)
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # single dtype frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        with catch_warnings(record=True):
            # ## panel ##
            p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                           items=['Item1', 'Item2'],
                           major_axis=pd.date_range('2001/1/12', periods=4),
                           minor_axis=['A', 'B'],
                           dtype='float64')

            # panel setting via item
            p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                           items=['Item1', 'Item2'],
                           major_axis=pd.date_range('2001/1/12', periods=4),
                           minor_axis=['A', 'B'],
                           dtype='float64')
            expected = p_orig.copy()
            expected['Item3'] = expected['Item1']
            p = p_orig.copy()
            p.loc['Item3'] = p['Item1']
            tm.assert_panel_equal(p, expected)

            # panel with aligned series
            expected = p_orig.copy()
            expected = expected.transpose(2, 1, 0)
            expected['C'] = DataFrame(
                {
                    'Item1': [30, 30, 30, 30],
                    'Item2': [32, 32, 32, 32]
                },
                index=p_orig.major_axis)
            expected = expected.transpose(2, 1, 0)
            p = p_orig.copy()
            p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
            tm.assert_panel_equal(p, expected)

        # GH 8473
        dates = date_range('1/1/2000', periods=8)
        df_orig = DataFrame(np.random.randn(8, 4),
                            index=dates,
                            columns=['A', 'B', 'C', 'D'])

        expected = pd.concat(
            [df_orig, DataFrame({'A': 7}, index=[dates[-1] + 1])])
        df = df_orig.copy()
        df.loc[dates[-1] + 1, 'A'] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + 1, 'A'] = 7
        tm.assert_frame_equal(df, expected)

        exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
        expected = pd.concat([df_orig, exp_other], axis=1)

        df = df_orig.copy()
        df.loc[dates[-1] + 1, 0] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + 1, 0] = 7
        tm.assert_frame_equal(df, expected)
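
# The enlargement behavior exercised above also works outside the test
# harness; a minimal sketch, assuming only pandas and numpy:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'],
                  dtype='int64')
df.loc[3] = df.loc[2]     # assigning to a new row label enlarges the frame
df.loc[:, 'C'] = df['A']  # assigning to a new column label enlarges it too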
Example #39
def test_getitem_setitem_datetimeindex():
    N = 50
    # testing with timezone, GH #2785
    rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
    ts = Series(np.random.randn(N), index=rng)

    result = ts["1990-01-01 04:00:00"]
    expected = ts[4]
    assert result == expected

    result = ts.copy()
    result["1990-01-01 04:00:00"] = 0
    result["1990-01-01 04:00:00"] = ts[4]
    assert_series_equal(result, ts)

    result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"]
    expected = ts[4:8]
    assert_series_equal(result, expected)

    result = ts.copy()
    result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0
    result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8]
    assert_series_equal(result, ts)

    lb = "1990-01-01 04:00:00"
    rb = "1990-01-01 07:00:00"
    # GH#18435 strings get a pass from tzawareness compat
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    assert_series_equal(result, expected)

    lb = "1990-01-01 04:00:00-0500"
    rb = "1990-01-01 07:00:00-0500"
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    assert_series_equal(result, expected)

    # repeat all the above with naive datetimes
    result = ts[datetime(1990, 1, 1, 4)]
    expected = ts[4]
    assert result == expected

    result = ts.copy()
    result[datetime(1990, 1, 1, 4)] = 0
    result[datetime(1990, 1, 1, 4)] = ts[4]
    assert_series_equal(result, ts)

    result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)]
    expected = ts[4:8]
    assert_series_equal(result, expected)

    result = ts.copy()
    result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0
    result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8]
    assert_series_equal(result, ts)

    lb = datetime(1990, 1, 1, 4)
    rb = datetime(1990, 1, 1, 7)
    msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
    with pytest.raises(TypeError, match=msg):
        # tznaive vs tzaware comparison is invalid
        # see GH#18376, GH#18162
        ts[(ts.index >= lb) & (ts.index <= rb)]

    lb = pd.Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo)
    rb = pd.Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo)
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    assert_series_equal(result, expected)

    result = ts[ts.index[4]]
    expected = ts[4]
    assert result == expected

    result = ts[ts.index[4:8]]
    expected = ts[4:8]
    assert_series_equal(result, expected)

    result = ts.copy()
    result[ts.index[4:8]] = 0
    result[4:8] = ts[4:8]
    assert_series_equal(result, ts)

    # also test partial date slicing
    result = ts["1990-01-02"]
    expected = ts[24:48]
    assert_series_equal(result, expected)

    result = ts.copy()
    result["1990-01-02"] = 0
    result["1990-01-02"] = ts[24:48]
    assert_series_equal(result, ts)
Example #40
def QA_fetch_get_factor_groupby(factor: pd.Series,
                                industry_cls: str = "sw_l1",
                                detailed: bool = False) -> pd.DataFrame:
    """
    获取因子的行业暴露, 注意,返回的值是 pd.DataFrame 格式,包含原因子值,附加一列
    因子对应的行业信息 (需先自行导入聚宽本地 sdk 并登陆)

    参数
    ---
    :param factor: 因子值,索引为 ['日期' '资产']
    :param industry_cls: 行业分类,默认为申万 1 级行业
    :param detailed: 是否使用详细模式,默认为 False, 即取因子日期最后一日的行业信息

    返回值
    ---
    :return: 因子数据, 包括因子值,因子对应行业
    """
    warnings.warn("请先自行导入聚宽本地 sdk 并登陆", UserWarning)
    # Normalize the factor format
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    # Normalize the stock codes
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates())
    # Non-detailed mode: industry data uses the last available date
    ss = pd.Series(dtype=object)
    if detailed:
        date_range = (factor.index.get_level_values(
            "datetime").drop_duplicates().tolist())

        industry = pd.DataFrame()
        for cursor_date in date_range:
            df_tmp = QA_fetch_industry_adv(code=stock_list,
                                           cursor_date=cursor_date)[[
                                               "code", "industry_name"
                                           ]]
            df_tmp["date"] = cursor_date
            industry = industry.append(df_tmp)
        ss = industry.set_index(["date", "code"])["industry_name"]
    else:
        end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        date_range = [pd.Timestamp(end_time)]
        ss = QA_fetch_industry_adv(stock_list,
                                   end_time)[["code", "industry_name"
                                              ]].set_index(["date", "code"
                                                            ])["industry_name"]
    # A stock may lack industry info for early dates; backfill from later dates
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index([
        "date", "code"
    ]).assign(group=ss).reset_index().set_index(["datetime",
                                                 "code"]).drop("date", axis=1))
    group = merged_data["group"].unstack().bfill().stack()
    merged_data["group"] = group
    return merged_data
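
# The closing unstack/bfill/stack step is what backfills missing industry
# labels per stock; a self-contained sketch of just that trick (the labels
# below are made up):
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range("2021-01-01", periods=3), ["000001", "000002"]],
    names=["datetime", "code"])
grp = pd.Series([np.nan, "bank", "tech", "bank", "tech", "bank"], index=idx)
filled = grp.unstack().bfill().stack()  # earlier gaps take the later label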
Example #41
def QA_fetch_factor_weight(factor: pd.Series,
                           weight_cls: str = "mktcap",
                           detailed: bool = True) -> pd.DataFrame:
    """
    获取因子的市值暴露, 注意,返回的值是 pd.DataFrame 格式,包含原因子值,附加一列
    因子对应的加权信息

    参数
    ---
    :param factor: 因子值,索引为 ['日期' '资产']
    :param weight_cls: 权重信息,默认加权方式为总市值加权
    :param detailed: 默认为 True, 如果为 False, 取因子最后一日的加权信息

    返回值
    ---
    :return: 因子数据, 包括因子值,因子对应行业
    """
    # Normalize the factor format
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))

    # Normalize the stock codes
    code_list = factor.index.get_level_values(
        "code").drop_duplicates().tolist()
    # Non-detailed mode: weighting data uses the last available date
    if detailed:
        date_range = (factor.index.get_level_values(
            "datetime").drop_duplicates().tolist())
    else:
        # end_time must be computed in this branch, otherwise it is undefined
        end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        date_range = [pd.Timestamp(end_time)]

    if weight_cls == "avg":
        merged_data["weight"] = 1.0
        return merged_data

    df_local = QAAnalysis_block(code=code_list,
                                start=date_range[0],
                                end=date_range[-1]).market_value
    if weight_cls == "mktcap":
        df_local = df_local.reset_index().pivot(index="date",
                                                columns="code",
                                                values="mv")
    elif weight_cls == "sqrt_mktcap":
        df_local = (df_local.reset_index().pivot(
            index="date", columns="code", values="mv").transform("sqrt"))
    elif weight_cls == "ln_mktcap":
        df_local = (df_local.reset_index().pivot(index="date",
                                                 columns="code",
                                                 values="mv").transform("ln"))
    elif weight_cls == "cmktcap":
        df_local = df_local.reset_index().pivot(index="date",
                                                columns="code",
                                                values="liquidity_mv")
    elif weight_cls == "sqrt_cmktcap":
        df_local = (df_local.reset_index().pivot(
            index="date", columns="code",
            values="liquidity_mv").transform("sqrt"))
    elif weight_cls == "ln_cmktcap":
        df_local = (df_local.reset_index().pivot(
            index="date", columns="code",
            values="liquidity_mv").transform("ln"))
    else:
        raise ValueError(f"weighting scheme {weight_cls} is not implemented")
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index(
        ["date",
         "code"]).assign(weight=df_local.stack()).reset_index().set_index(
             ["datetime", "code"]).drop("date", axis=1))

    weight = merged_data["weight"].unstack().bfill().stack()
    merged_data["weight"] = weight
    return merged_data
Example #42
frame3 = DataFrame(pop)
frame3.index.name = 'year'
frame3.columns.name = 'state'

frameTest = DataFrame(frame3)
frameTest = DataFrame(frame3, index=[1, 2, 3])

# pandas Index object

obj1 = Series(range(3), index=['a', 'b', 'c'])
index = obj1.index
index = pd.Index(range(3))
obj2 = Series([1.5, -2.5, 0], index=index)
# obj2.index is index  # True - the Index object is shared, not copied
obj3 = obj2.copy()
# obj3 is obj2  # evaluates to False - `is` compares object identity, not the data

# Some pandas reindexing examples.

obj1 = Series([4.5, 7.2, -5.3, 3.7], index=['d', 'b', 'a', 'c'])
obj2 = obj1.reindex(['a', 'b', 'c', 'd', 'e'])
obj2 = obj1.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

frame = DataFrame(np.arange(9).reshape((3, 3)),
                  index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns=states)
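
# As the identity comments above note, `is` compares objects, not data;
# value comparison goes through `.equals`. A minimal sketch:
s1 = Series([1.5, -2.5, 0])
s2 = s1.copy()
# s1 is s2       -> False: distinct objects
# s1.equals(s2)  -> True: same labels and values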
Example #43
    def _lagged_values(X: pd.Series, p: int, ar_coef: list):
        """Helper Function to Calculate AutoRegressive(AR) Component"""

        return X if p == 0 else pd.concat([X.copy().shift(periods=i) for i in range(1, p + 1)], axis=1).dot(ar_coef)
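
# A usage sketch for the helper above (assuming it is accessible and that
# `ar_coef` has exactly `p` entries): column i holds X shifted by i periods,
# so the dot product computes sum(ar_coef[i-1] * X[t-i]).
import pandas as pd

X = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
ar = _lagged_values(X, p=2, ar_coef=[0.5, 0.25])
# ar[2] == 0.5 * X[1] + 0.25 * X[0] == 1.25; the first p entries are NaN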
Example #44
def test_getitem_setitem_datetimeindex():
    N = 50
    # testing with timezone, GH #2785
    rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern")
    ts = Series(np.random.randn(N), index=rng)

    result = ts["1990-01-01 04:00:00"]
    expected = ts[4]
    assert result == expected

    result = ts.copy()
    result["1990-01-01 04:00:00"] = 0
    result["1990-01-01 04:00:00"] = ts[4]
    tm.assert_series_equal(result, ts)

    result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0
    result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8]
    tm.assert_series_equal(result, ts)

    lb = "1990-01-01 04:00:00"
    rb = "1990-01-01 07:00:00"
    # GH#18435 strings get a pass from tzawareness compat
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    lb = "1990-01-01 04:00:00-0500"
    rb = "1990-01-01 07:00:00-0500"
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    # But we do not give datetimes a pass on tzawareness compat
    # TODO: do the same with Timestamps and dt64
    msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
    naive = datetime(1990, 1, 1, 4)
    with tm.assert_produces_warning(FutureWarning):
        # GH#36148 will require tzawareness compat
        result = ts[naive]
    expected = ts[4]
    assert result == expected

    result = ts.copy()
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4)] = 0
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4)] = ts[4]
    tm.assert_series_equal(result, ts)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8]
    tm.assert_series_equal(result, ts)

    lb = datetime(1990, 1, 1, 4)
    rb = datetime(1990, 1, 1, 7)
    msg = r"Invalid comparison between dtype=datetime64\[ns, US/Eastern\] and datetime"
    with pytest.raises(TypeError, match=msg):
        # tznaive vs tzaware comparison is invalid
        # see GH#18376, GH#18162
        ts[(ts.index >= lb) & (ts.index <= rb)]

    lb = Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo)
    rb = Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo)
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts[ts.index[4]]
    expected = ts[4]
    assert result == expected

    result = ts[ts.index[4:8]]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    result[ts.index[4:8]] = 0
    result.iloc[4:8] = ts.iloc[4:8]
    tm.assert_series_equal(result, ts)

    # also test partial date slicing
    result = ts["1990-01-02"]
    expected = ts[24:48]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    result["1990-01-02"] = 0
    result["1990-01-02"] = ts[24:48]
    tm.assert_series_equal(result, ts)
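
# The tz-compat rule behind the failing comparisons above, in isolation:
# a tz-aware index cannot be compared with a naive datetime, but localized
# Timestamps compare fine.
rng2 = date_range("1/1/1990", periods=50, freq="H", tz="US/Eastern")
lb2 = Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng2.tz)
rng2 >= lb2                        # fine: both sides tz-aware
# rng2 >= datetime(1990, 1, 1, 4)  # raises TypeError: naive vs aware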
Example #45
    def test_equals(self):
        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
        s2 = s1.copy()
        self.assertTrue(s1.equals(s2))

        s1[1] = 99
        self.assertFalse(s1.equals(s2))

        # NaNs compare as equal
        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
        s2 = s1.copy()
        self.assertTrue(s1.equals(s2))

        s2[0] = 9.9
        self.assertFalse(s1.equals(s2))

        idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
        s1 = Series([1, 2, np.nan], index=idx)
        s2 = s1.copy()
        self.assertTrue(s1.equals(s2))

        # Add object dtype column with nans
        index = np.random.random(10)
        df1 = DataFrame(np.random.random(10),
                        index=index,
                        columns=['floats'])
        df1['text'] = 'the sky is so blue. we could use more chocolate.'.split()
        df1['start'] = date_range('2000-1-1', periods=10, freq='T')
        df1['end'] = date_range('2000-1-1', periods=10, freq='D')
        df1['diff'] = df1['end'] - df1['start']
        df1['bool'] = (np.arange(10) % 3 == 0)
        df1.ix[::2] = nan
        df2 = df1.copy()
        self.assertTrue(df1['text'].equals(df2['text']))
        self.assertTrue(df1['start'].equals(df2['start']))
        self.assertTrue(df1['end'].equals(df2['end']))
        self.assertTrue(df1['diff'].equals(df2['diff']))
        self.assertTrue(df1['bool'].equals(df2['bool']))
        self.assertTrue(df1.equals(df2))
        self.assertFalse(df1.equals(object))

        # different dtype
        different = df1.copy()
        different['floats'] = different['floats'].astype('float32')
        self.assertFalse(df1.equals(different))

        # different index
        different_index = -index
        different = df2.set_index(different_index)
        self.assertFalse(df1.equals(different))

        # different columns
        different = df2.copy()
        different.columns = df2.columns[::-1]
        self.assertFalse(df1.equals(different))

        # DatetimeIndex
        index = pd.date_range('2000-1-1', periods=10, freq='T')
        df1 = df1.set_index(index)
        df2 = df1.copy()
        self.assertTrue(df1.equals(df2))

        # MultiIndex
        df3 = df1.set_index(['text'], append=True)
        df2 = df1.set_index(['text'], append=True)
        self.assertTrue(df3.equals(df2))

        df2 = df1.set_index(['floats'], append=True)
        self.assertFalse(df3.equals(df2))

        # NaN in index
        df3 = df1.set_index(['floats'], append=True)
        df2 = df1.set_index(['floats'], append=True)
        self.assertTrue(df3.equals(df2))
Example #46
def test_cython_transform_frame(op, args, targop):
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list("qwertyuiopasdfghjklz")
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame(
        {
            "float": s,
            "float_missing": s_missing,
            "int": [1, 1, 1, 1, 2] * 200,
            "datetime": pd.date_range("1990-1-1", periods=1000),
            "timedelta": pd.timedelta_range(1, freq="s", periods=1000),
            "string": strings * 50,
            "string_missing": strings_missing * 50,
        },
        columns=[
            "float",
            "float_missing",
            "int",
            "datetime",
            "timedelta",
            "string",
            "string_missing",
        ],
    )
    df["cat"] = df["string"].astype("category")

    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    for df in [df, df2]:
        for gb_target in [
                dict(by=labels),
                dict(level=0),
                dict(by="string"),
        ]:  # dict(by='string_missing')]:
            # dict(by=['int','string'])]:

            gb = df.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # a bit of a hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == "shift":
                gb._set_group_selection()

            if op != "shift" and "int" not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                i = gb[["int"]].apply(targop)
                f = gb[["float", "float_missing"]].apply(targop)
                expected = pd.concat([f, i], axis=1)
            else:
                expected = gb.apply(targop)

            expected = expected.sort_index(axis=1)
            tm.assert_frame_equal(expected,
                                  gb.transform(op, *args).sort_index(axis=1))
            tm.assert_frame_equal(expected,
                                  getattr(gb, op)(*args).sort_index(axis=1))
            # individual columns
            for c in df:
                if c not in ["float", "int", "float_missing"
                             ] and op != "shift":
                    msg = "No numeric types to aggregate"
                    with pytest.raises(DataError, match=msg):
                        gb[c].transform(op)
                    with pytest.raises(DataError, match=msg):
                        getattr(gb[c], op)()
                else:
                    expected = gb[c].apply(targop)
                    expected.name = c
                    tm.assert_series_equal(expected,
                                           gb[c].transform(op, *args))
                    tm.assert_series_equal(expected, getattr(gb[c], op)(*args))
Example #47
    def test_convert(self):
        # Tests: All to nans, coerce, true
        # Test coercion returns correct type
        s = Series(['a', 'b', 'c'])
        results = s._convert(datetime=True, coerce=True)
        expected = Series([lib.NaT] * 3)
        assert_series_equal(results, expected)

        results = s._convert(numeric=True, coerce=True)
        expected = Series([np.nan] * 3)
        assert_series_equal(results, expected)

        expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
        results = s._convert(timedelta=True, coerce=True)
        assert_series_equal(results, expected)

        dt = datetime(2001, 1, 1, 0, 0)
        td = dt - datetime(2000, 1, 1, 0, 0)

        # Test coercion with mixed types
        s = Series(['a', '3.1415', dt, td])
        results = s._convert(datetime=True, coerce=True)
        expected = Series([lib.NaT, lib.NaT, dt, lib.NaT])
        assert_series_equal(results, expected)

        results = s._convert(numeric=True, coerce=True)
        expected = Series([nan, 3.1415, nan, nan])
        assert_series_equal(results, expected)

        results = s._convert(timedelta=True, coerce=True)
        expected = Series([lib.NaT, lib.NaT, lib.NaT, td],
                          dtype=np.dtype('m8[ns]'))
        assert_series_equal(results, expected)

        # Test standard conversion returns original
        results = s._convert(datetime=True)
        assert_series_equal(results, s)
        results = s._convert(numeric=True)
        expected = Series([nan, 3.1415, nan, nan])
        assert_series_equal(results, expected)
        results = s._convert(timedelta=True)
        assert_series_equal(results, s)

        # test pass-through and non-conversion when other types selected
        s = Series(['1.0', '2.0', '3.0'])
        results = s._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([1.0, 2.0, 3.0])
        assert_series_equal(results, expected)
        results = s._convert(True, False, True)
        assert_series_equal(results, s)

        s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)],
                   dtype='O')
        results = s._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0,
                                                                0)])
        assert_series_equal(results, expected)
        results = s._convert(datetime=False, numeric=True, timedelta=True)
        assert_series_equal(results, s)

        td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
        s = Series([td, td], dtype='O')
        results = s._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([td, td])
        assert_series_equal(results, expected)
        results = s._convert(True, True, False)
        assert_series_equal(results, s)

        s = Series([1., 2, 3], index=['a', 'b', 'c'])
        result = s._convert(numeric=True)
        assert_series_equal(result, s)

        # force numeric conversion
        r = s.copy().astype('O')
        r['a'] = '1'
        result = r._convert(numeric=True)
        assert_series_equal(result, s)

        r = s.copy().astype('O')
        r['a'] = '1.'
        result = r._convert(numeric=True)
        assert_series_equal(result, s)

        r = s.copy().astype('O')
        r['a'] = 'garbled'
        result = r._convert(numeric=True)
        expected = s.copy()
        expected['a'] = nan
        assert_series_equal(result, expected)

        # GH 4119, not converting a mixed type (e.g.floats and object)
        s = Series([1, 'na', 3, 4])
        result = s._convert(datetime=True, numeric=True)
        expected = Series([1, nan, 3, 4])
        assert_series_equal(result, expected)

        s = Series([1, '', 3, 4])
        result = s._convert(datetime=True, numeric=True)
        assert_series_equal(result, expected)

        # dates
        s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                    datetime(2001, 1, 3, 0, 0)])
        s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                     datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1,
                     Timestamp('20010104'), '20010105'], dtype='O')

        result = s._convert(datetime=True)
        expected = Series([Timestamp('20010101'), Timestamp('20010102'),
                           Timestamp('20010103')], dtype='M8[ns]')
        assert_series_equal(result, expected)

        result = s._convert(datetime=True, coerce=True)
        assert_series_equal(result, expected)

        expected = Series([Timestamp('20010101'), Timestamp('20010102'),
                           Timestamp('20010103'), lib.NaT, lib.NaT, lib.NaT,
                           Timestamp('20010104'), Timestamp('20010105')],
                          dtype='M8[ns]')
        result = s2._convert(datetime=True, numeric=False, timedelta=False,
                             coerce=True)
        assert_series_equal(result, expected)
        result = s2._convert(datetime=True, coerce=True)
        assert_series_equal(result, expected)

        s = Series(['foo', 'bar', 1, 1.0], dtype='O')
        result = s._convert(datetime=True, coerce=True)
        expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
        assert_series_equal(result, expected)

        # preserve if non-object
        s = Series([1], dtype='float32')
        result = s._convert(datetime=True, coerce=True)
        assert_series_equal(result, s)

        # r = s.copy()
        # r[0] = np.nan
        # result = r._convert(convert_dates=True,convert_numeric=False)
        # assert result.dtype == 'M8[ns]'

        # dateutil parses some single letters into today's value as a date
        expected = Series([lib.NaT])
        for x in 'abcdefghijklmnopqrstuvwxyz':
            s = Series([x])
            result = s._convert(datetime=True, coerce=True)
            assert_series_equal(result, expected)
            s = Series([x.upper()])
            result = s._convert(datetime=True, coerce=True)
            assert_series_equal(result, expected)
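
# `_convert` is private API; a rough public-API sketch of the same coercions
# (assuming pandas is imported as pd):
import pandas as pd

s = pd.Series(['1.0', 'garbled', '3'])
pd.to_numeric(s, errors='coerce')                                # 1.0, NaN, 3.0
pd.to_datetime(pd.Series(['20010101', 'foo']), errors='coerce')  # Timestamp, NaT
pd.to_timedelta(pd.Series(['1 day', 'oops']), errors='coerce')   # Timedelta, NaT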
Example #48
    def test_cython_transform(self):
        # GH 4095
        ops = [(('cumprod', ()), lambda x: x.cumprod()),
               (('cumsum', ()), lambda x: x.cumsum()),
               (('shift', (-1, )), lambda x: x.shift(-1)),
               (('shift', (1, )), lambda x: x.shift())]

        s = Series(np.random.randn(1000))
        s_missing = s.copy()
        s_missing.iloc[2:10] = np.nan
        labels = np.random.randint(0, 50, size=1000).astype(float)

        # series
        for (op, args), targop in ops:
            for data in [s, s_missing]:
                # print(data.head())
                expected = data.groupby(labels).transform(targop)

                tm.assert_series_equal(
                    expected,
                    data.groupby(labels).transform(op, *args))
                tm.assert_series_equal(
                    expected,
                    getattr(data.groupby(labels), op)(*args))

        strings = list('qwertyuiopasdfghjklz')
        strings_missing = strings[:]
        strings_missing[5] = np.nan
        df = DataFrame({
            'float':
            s,
            'float_missing':
            s_missing,
            'int': [1, 1, 1, 1, 2] * 200,
            'datetime':
            pd.date_range('1990-1-1', periods=1000),
            'timedelta':
            pd.timedelta_range(1, freq='s', periods=1000),
            'string':
            strings * 50,
            'string_missing':
            strings_missing * 50
        })
        df['cat'] = df['string'].astype('category')

        df2 = df.copy()
        df2.index = pd.MultiIndex.from_product([range(100), range(10)])

        # DataFrame - Single and MultiIndex,
        # group by values, index level, columns
        for df in [df, df2]:
            for gb_target in [
                    dict(by=labels),
                    dict(level=0),
                    dict(by='string')
            ]:  # dict(by='string_missing')]:
                # dict(by=['int','string'])]:

                gb = df.groupby(**gb_target)
                # whitelisted methods set the selection before applying
                # a bit of a hack to make sure the cythonized shift
                # is equivalent to pre 0.17.1 behavior
                if op == 'shift':
                    gb._set_group_selection()

                for (op, args), targop in ops:
                    if op != 'shift' and 'int' not in gb_target:
                        # numeric apply fastpath promotes dtype so have
                        # to apply separately and concat
                        i = gb[['int']].apply(targop)
                        f = gb[['float', 'float_missing']].apply(targop)
                        expected = pd.concat([f, i], axis=1)
                    else:
                        expected = gb.apply(targop)

                    expected = expected.sort_index(axis=1)
                    tm.assert_frame_equal(
                        expected,
                        gb.transform(op, *args).sort_index(axis=1))
                    tm.assert_frame_equal(expected, getattr(gb, op)(*args))
                    # individual columns
                    for c in df:
                        if c not in ['float', 'int', 'float_missing'
                                     ] and op != 'shift':
                            pytest.raises(DataError, gb[c].transform, op)
                            pytest.raises(DataError, getattr(gb[c], op))
                        else:
                            expected = gb[c].apply(targop)
                            expected.name = c
                            tm.assert_series_equal(expected,
                                                   gb[c].transform(op, *args))
                            tm.assert_series_equal(expected,
                                                   getattr(gb[c], op)(*args))
Example #49
def predict_testcounts(
    testcounts: pandas.Series,
    *,
    country: str,
    region: typing.Optional[typing.Union[str, typing.List[str]]],
    regional_holidays: bool = False,
    keep_data: bool,
    ignore_before: typing.Optional[typing.Union[datetime.datetime,
                                                pandas.Timestamp, str]] = None,
    **kwargs,
) -> ForecastingResult:
    """ Predict/smooth missing test counts with Prophet.

    Implemented by Laura Helleckes and Michael Osthege.

    Parameters
    ----------
    testcounts : pandas.Series
        date-indexed series of observed testcounts
    country : str
        name or short code of country (as used by https://github.com/dr-prodigy/python-holidays)
    region : optional, [str]
        if None or []: only nation-wide
        if "all": nation-wide and all regions
        if "CA": nation-wide and those for region "CA"
        if ["CA", "NY", "FL"]: nation-wide and those for all listed regions
    regional_holidays: bool, default False
        if True, fetch regional holidays for each region; requires `region`
        to cover more than one region.
        if False (default), fetch only national holidays (useful for countries
        where test data is available at the regional level, but which only
        have national holidays).
    keep_data : bool
        if True, existing entries are kept
        if False, existing entries are also predicted, resulting in a smoothed profile
    ignore_before : timestamp
        all dates before this are ignored
        Use this argument to prevent an unrealistic upwards trend due to initial testing ramp-up
    **kwargs
        optional kwargs for the `fbprophet.Prophet`. For example:
        * growth: 'linear' or 'logistic' (default)
        * seasonality_mode: 'additive' or 'multiplicative' (default)

    Returns
    -------
    result : pandas.Series
        the date-indexed series of smoothed/predicted testcounts
    m : fbprophet.Prophet
        the Prophet model
    forecast : pandas.DataFrame
        contains the model prediction
    holidays : dict of { datetime : str }
        dictionary of the holidays that were used in the model
    """
    testcounts.index.name = "date"
    testcounts.name = "total"
    if not ignore_before:
        ignore_before = testcounts.index[0]

    # for safety, sort the index
    testcounts.sort_index(inplace=True)

    mask_fit = testcounts.index >= ignore_before
    if keep_data:
        mask_predict = numpy.logical_and(testcounts.index >= ignore_before,
                                         numpy.isnan(testcounts.values))
    else:
        mask_predict = testcounts.index >= ignore_before

    years = set([testcounts.index[0].year, testcounts.index[-1].year])
    regions = numpy.atleast_1d(region)

    if region != "all" and len(regions) <= 1 and regional_holidays:
        raise ValueError(
            "Test counts are being predicted only at the national level or "
            "for a single region, so regional holidays cannot be used. Set "
            "the `regional_holidays` kwarg to False."
        )
    # the last condition is needed because some countries have only national holidays for all regions:
    if (region == "all" or len(regions) > 1) and regional_holidays:
        # distinguish between national and regional holidays
        all_holidays = get_holidays(country, region, years=years)
        national_holidays = get_holidays(country, region=None, years=years)

        holiday_df = pandas.DataFrame(
            data=[(
                date,
                name,
                "national" if date in national_holidays.keys() else "regional",
            ) for date, name in all_holidays.items()],
            columns=["ds", "name", "holiday"],
        )
    else:
        # none, or only one region -> no distinction between national/regional holidays
        all_holidays = get_holidays(country, region=None, years=years)
        holiday_df = pandas.DataFrame(
            dict(
                holiday="holiday",
                name=list(all_holidays.values()),
                ds=pandas.to_datetime(list(all_holidays.keys())),
            ))

    # Config settings of forecast model
    days = (testcounts.index[-1] - testcounts.index[0]).days
    prophet_kwargs = dict(
        growth="logistic",
        seasonality_mode="multiplicative",
        daily_seasonality=False,
        weekly_seasonality=True,
        yearly_seasonality=False,
        holidays=holiday_df,
        mcmc_samples=500,
        # restrict number of potential changepoints:
        n_changepoints=int(numpy.ceil(days / 30)),
    )
    # override defaults with user-specified kwargs
    prophet_kwargs.update(kwargs)
    m = fbprophet.Prophet(**prophet_kwargs)

    # fit only the selected subset of the data
    df_fit = (testcounts.loc[mask_fit].reset_index().rename(columns={
        "date": "ds",
        "total": "y"
    }))

    if prophet_kwargs["growth"] == "logistic":
        cap = numpy.max(testcounts) * 1
        df_fit["floor"] = 0
        df_fit["cap"] = cap
    m.fit(df_fit)

    # predict for all dates in the input
    df_predict = testcounts.reset_index().rename(columns={"date": "ds"})
    if prophet_kwargs["growth"] == "logistic":
        df_predict["floor"] = 0
        df_predict["cap"] = cap
    forecast = m.predict(df_predict)

    # make a series of the result that has the same index as the input
    result = pandas.Series(index=testcounts.index,
                           data=testcounts.copy().values,
                           name="testcount")
    result.loc[mask_predict] = numpy.clip(
        forecast.set_index("ds").yhat, 0, forecast.yhat.max())
    # full-length result series, model and forecast are returned
    return result, m, forecast, all_holidays
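
# A hypothetical call sketch (assumes fbprophet and the get_holidays helper
# are importable; the counts below are synthetic):
dates = pandas.date_range("2020-03-01", periods=60, freq="D")
counts = pandas.Series(
    numpy.random.poisson(1000, size=60).astype(float), index=dates)
counts.iloc[10:15] = numpy.nan  # the gap Prophet should fill in
result, m, forecast, holidays = predict_testcounts(
    counts, country="US", region=None, keep_data=True)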
Example #50
    def test_logical_ops_label_based(self):
        # GH#4947
        # logical ops should be label based

        a = Series([True, False, True], list("bca"))
        b = Series([False, True, False], list("abc"))

        expected = Series([False, True, False], list("abc"))
        result = a & b
        tm.assert_series_equal(result, expected)

        expected = Series([True, True, False], list("abc"))
        result = a | b
        tm.assert_series_equal(result, expected)

        expected = Series([True, False, False], list("abc"))
        result = a ^ b
        tm.assert_series_equal(result, expected)

        # rhs is bigger
        a = Series([True, False, True], list("bca"))
        b = Series([False, True, False, True], list("abcd"))

        expected = Series([False, True, False, False], list("abcd"))
        result = a & b
        tm.assert_series_equal(result, expected)

        expected = Series([True, True, False, False], list("abcd"))
        result = a | b
        tm.assert_series_equal(result, expected)

        # filling

        # vs empty
        empty = Series([], dtype=object)

        result = a & empty.copy()
        expected = Series([False, False, False], list("bca"))
        tm.assert_series_equal(result, expected)

        result = a | empty.copy()
        expected = Series([True, False, True], list("bca"))
        tm.assert_series_equal(result, expected)

        # vs non-matching
        result = a & Series([1], ["z"])
        expected = Series([False, False, False, False], list("abcz"))
        tm.assert_series_equal(result, expected)

        result = a | Series([1], ["z"])
        expected = Series([True, True, False, False], list("abcz"))
        tm.assert_series_equal(result, expected)

        # identity
        # we would like s[s|e] == s to hold for any e, whether empty or not
        for e in [
                empty.copy(),
                Series([1], ["z"]),
                Series(np.nan, b.index),
                Series(np.nan, a.index),
        ]:
            result = a[a | e]
            tm.assert_series_equal(result, a[a])

        for e in [Series(["z"])]:
            result = a[a | e]
            tm.assert_series_equal(result, a[a])

        # vs scalars
        index = list("bca")
        t = Series([True, False, True])

        for v in [True, 1, 2]:
            result = Series([True, False, True], index=index) | v
            expected = Series([True, True, True], index=index)
            tm.assert_series_equal(result, expected)

        msg = "Cannot perform.+with a dtyped.+array and scalar of type"
        for v in [np.nan, "foo"]:
            with pytest.raises(TypeError, match=msg):
                t | v

        for v in [False, 0]:
            result = Series([True, False, True], index=index) | v
            expected = Series([True, False, True], index=index)
            tm.assert_series_equal(result, expected)

        for v in [True, 1]:
            result = Series([True, False, True], index=index) & v
            expected = Series([True, False, True], index=index)
            tm.assert_series_equal(result, expected)

        for v in [False, 0]:
            result = Series([True, False, True], index=index) & v
            expected = Series([False, False, False], index=index)
            tm.assert_series_equal(result, expected)
        msg = "Cannot perform.+with a dtyped.+array and scalar of type"
        for v in [np.nan]:
            with pytest.raises(TypeError, match=msg):
                t & v
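
# The alignment rule the test exercises, in isolation (minimal sketch):
# operands are aligned on the union of their labels before the boolean op,
# and missing slots act as False.
import pandas as pd

a = pd.Series([True, False, True], index=list("bca"))
b = pd.Series([False, True, False], index=list("abc"))
a & b  # index ['a', 'b', 'c'] -> [False, True, False]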
Example #51
def psar(high, low, close=None, af=None, max_af=None, offset=None, **kwargs):
    """Indicator: Parabolic Stop and Reverse (PSAR)"""
    # Validate Arguments
    high = verify_series(high)
    low = verify_series(low)
    af = float(af) if af and af > 0 else 0.02
    max_af = float(max_af) if max_af and max_af > 0 else 0.2
    offset = get_offset(offset)

    # Initialize
    m = high.shape[0]
    af0 = af
    bullish = True
    high_point = high.iloc[0]
    low_point = low.iloc[0]

    if close is not None:
        close = verify_series(close)
        sar = close.copy()
    else:
        sar = low.copy()

    long = Series(npNaN, index=sar.index)
    short = long.copy()
    reversal = Series(False, index=sar.index)
    _af = long.copy()
    _af.iloc[0:2] = af0

    # Calculate Result
    for i in range(2, m):
        reverse = False
        _af[i] = af

        if bullish:
            sar[i] = sar[i - 1] + af * (high_point - sar[i - 1])

            if low[i] < sar[i]:
                bullish, reverse, af = False, True, af0
                sar[i] = high_point
                low_point = low[i]
        else:
            sar[i] = sar[i - 1] + af * (low_point - sar[i - 1])

            if high[i] > sar[i]:
                bullish, reverse, af = True, True, af0
                sar[i] = low_point
                high_point = high[i]

        reversal[i] = reverse

        if not reverse:
            if bullish:
                if high[i] > high_point:
                    high_point = high[i]
                    af = min(af + af0, max_af)
                if low[i - 1] < sar[i]:
                    sar[i] = low[i - 1]
                if low[i - 2] < sar[i]:
                    sar[i] = low[i - 2]
            else:
                if low[i] < low_point:
                    low_point = low[i]
                    af = min(af + af0, max_af)
                if high[i - 1] > sar[i]:
                    sar[i] = high[i - 1]
                if high[i - 2] > sar[i]:
                    sar[i] = high[i - 2]

        if bullish:
            long[i] = sar[i]
        else:
            short[i] = sar[i]

    # Offset
    if offset != 0:
        _af = _af.shift(offset)
        long = long.shift(offset)
        short = short.shift(offset)
        reversal = reversal.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        _af.fillna(kwargs["fillna"], inplace=True)
        long.fillna(kwargs["fillna"], inplace=True)
        short.fillna(kwargs["fillna"], inplace=True)
        reversal.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        _af.fillna(method=kwargs["fill_method"], inplace=True)
        long.fillna(method=kwargs["fill_method"], inplace=True)
        short.fillna(method=kwargs["fill_method"], inplace=True)
        reversal.fillna(method=kwargs["fill_method"], inplace=True)

    # Prepare DataFrame to return
    _params = f"_{af0}_{max_af}"
    data = {
        f"PSARl{_params}": long,
        f"PSARs{_params}": short,
        f"PSARaf{_params}": _af,
        f"PSARr{_params}": reversal,
    }
    psardf = DataFrame(data)
    psardf.name = f"PSAR{_params}"
    psardf.category = long.category = short.category = "trend"

    return psardf
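
# A usage sketch, assuming the pandas_ta-style helpers used above
# (verify_series, get_offset, npNaN, Series, DataFrame) are in scope:
high = Series([10.0, 10.5, 10.4, 10.8, 11.2, 11.0])
low = Series([9.5, 9.9, 10.0, 10.3, 10.7, 10.6])
out = psar(high, low)     # defaults: af=0.02, max_af=0.2
out.filter(like="PSARl")  # long-side SAR values; NaN while a short is active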
Example #52
    def test_slice_integer(self):

        # same as above, but for Integer based indexes
        # these coerce to a like integer
        # oob indicates if we are out of bounds
        # of positional indexing
        for index, oob in [
            (Int64Index(range(5)), False),
            (RangeIndex(5), False),
            (Int64Index(range(5)) + 10, True),
        ]:

            # s is an in-range index
            s = Series(range(5), index=index)

            # getitem
            for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:

                    with catch_warnings(record=True):
                        result = idxr(s)[l]

                    # these are all label indexing
                    # except getitem which is positional
                    # empty
                    if oob:
                        indexer = slice(0, 0)
                    else:
                        indexer = slice(3, 5)
                    self.check(result, s, indexer, False)

                # positional indexing
                msg = (
                    "cannot do slice indexing"
                    r" on {klass} with these indexers \[(3|4)\.0\] of"
                    " {kind}".format(klass=type(index), kind=str(float))
                )
                with pytest.raises(TypeError, match=msg):
                    s[l]

            # getitem out-of-bounds
            for l in [slice(-6, 6), slice(-6.0, 6.0)]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:
                    with catch_warnings(record=True):
                        result = idxr(s)[l]

                    # these are all label indexing
                    # except getitem which is positional
                    # empty
                    if oob:
                        indexer = slice(0, 0)
                    else:
                        indexer = slice(-6, 6)
                    self.check(result, s, indexer, False)

            # positional indexing
            msg = (
                "cannot do slice indexing"
                r" on {klass} with these indexers \[-6\.0\] of"
                " {kind}".format(klass=type(index), kind=str(float))
            )
            with pytest.raises(TypeError, match=msg):
                s[slice(-6.0, 6.0)]

            # getitem odd floats
            for l, res1 in [
                (slice(2.5, 4), slice(3, 5)),
                (slice(2, 3.5), slice(2, 4)),
                (slice(2.5, 3.5), slice(3, 4)),
            ]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:

                    with catch_warnings(record=True):
                        result = idxr(s)[l]
                    if oob:
                        res = slice(0, 0)
                    else:
                        res = res1

                    self.check(result, s, res, False)

                # positional indexing
                msg = (
                    "cannot do slice indexing"
                    r" on {klass} with these indexers \[(2|3)\.5\] of"
                    " {kind}".format(klass=type(index), kind=str(float))
                )
                with pytest.raises(TypeError, match=msg):
                    s[l]

            # setitem
            for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:

                for idxr in [lambda x: x.loc, lambda x: x.ix]:
                    sc = s.copy()
                    with catch_warnings(record=True):
                        idxr(sc)[l] = 0
                        result = idxr(sc)[l].values.ravel()
                    assert (result == 0).all()

                # positional indexing
                msg = (
                    "cannot do slice indexing"
                    r" on {klass} with these indexers \[(3|4)\.0\] of"
                    " {kind}".format(klass=type(index), kind=str(float))
                )
                with pytest.raises(TypeError, match=msg):
                    s[l] = 0
Example #53
    def test_rank(self, datetime_series):
        from scipy.stats import rankdata

        datetime_series[::2] = np.nan
        datetime_series[:10][::3] = 4.0

        ranks = datetime_series.rank()
        oranks = datetime_series.astype("O").rank()

        tm.assert_series_equal(ranks, oranks)

        mask = np.isnan(datetime_series)
        filled = datetime_series.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name="ts")
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        tm.assert_series_equal(iranks, exp)
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        rng = date_range("1/1/1990", periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype="float64",
        )
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)
Example #54
    def test_drop_duplicates_categorical_non_bool(self, dtype, ordered):
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        # Test case 1
        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
        tc1 = Series(Categorical(input1, categories=cat_array,
                                 ordered=ordered))
        if dtype == "datetime64[D]":
            # pre-emptively xfail the flaky case; tc1 values are seemingly random
            if not (np.array(tc1) == input1).all():
                pytest.xfail(reason="GH#7996")

        expected = Series([False, False, False, True])
        tm.assert_series_equal(tc1.duplicated(), expected)
        tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        expected = Series([False, False, True, False])
        tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep="last"),
                               tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc1.duplicated(keep=False), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        # Test case 2
        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
        tc2 = Series(Categorical(input2, categories=cat_array,
                                 ordered=ordered))
        if dtype == "datetime64[D]":
            # pre-emptively xfail the flaky case; tc2 values are seemingly random
            if not (np.array(tc2) == input2).all():
                pytest.xfail(reason="GH#7996")

        expected = Series([False, False, False, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(), expected)
        tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, False, False, False])
        tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep="last"),
                               tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep="last", inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(keep=False), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])
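
# The `keep` semantics exercised above, on a plain Series (minimal sketch):
import pandas as pd

s = pd.Series([1, 2, 3, 3])
s.duplicated()                 # keep='first': only the second 3 is flagged
s.duplicated(keep='last')      # the first 3 is flagged instead
s.drop_duplicates(keep=False)  # drops every member of a duplicate group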
Example #55
    def test_rank(self):
        pytest.importorskip('scipy.special')
        rankdata = pytest.importorskip('scipy.stats').rankdata

        self.ts[::2] = np.nan
        self.ts[:10][::3] = 4.

        ranks = self.ts.rank()
        oranks = self.ts.astype('O').rank()

        assert_series_equal(ranks, oranks)

        mask = np.isnan(self.ts)
        filled = self.ts.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name='ts')
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        assert_series_equal(iranks, exp)
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        rng = date_range('1/1/1990', periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(['3 day', '1 day 10m', '-2 day', NaT], dtype='m8[ns]')
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype='float64')
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype='float64')
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)
Example #56
def QA_fetch_get_factor_groupby(factor: pd.Series,
                                industry_cls: str = "sw_l1",
                                detailed: bool = False) -> pd.DataFrame:
    """
    获取因子的行业暴露, 注意,返回的值是 pd.DataFrame 格式,包含原因子值,附加一列
    因子对应的行业信息 (需先自行导入聚宽本地 sdk 并登陆)

    参数
    ---
    :param factor: 因子值,索引为 ['日期' '资产']
    :param industry_cls: 行业分类,默认为申万 1 级行业
    :param detailed: 是否使用详细模式,默认为 False, 即取因子日期最后一日的行业信息

    返回值
    ---
    :return: 因子数据, 包括因子值,因子对应行业
    """
    warnings.warn("请先自行导入聚宽本地 sdk 并登陆", UserWarning)
    # 因子格式化
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    # normalize stock codes to JoinQuant style
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates(), style="jq")
    # detailed mode fetches industry info for every factor date;
    # otherwise only the last date is used
    if detailed:
        # start_time = str(min(factor.index.get_level_values("datetime")))[:10]
        # end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        # date_range = list(
        #     map(pd.Timestamp, QA_util_get_trade_range(start_time, end_time))
        # )
        date_range = (factor.index.get_level_values(
            "datetime").drop_duplicates().tolist())

        industries = map(partial(jqdatasdk.get_industry, stock_list),
                         date_range)
        industries = {
            d: {
                s: ind.get(s).get(industry_cls,
                                  dict()).get("industry_name", "NA")
                for s in stock_list
            }
            for d, ind in zip(date_range, industries)
        }
    else:
        end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        date_range = [pd.Timestamp(end_time)]
        industries = jqdatasdk.get_industry(stock_list, end_time)
        industries = {
            d: {
                s: industries.get(s).get(industry_cls,
                                         dict()).get("industry_name", "NA")
                for s in stock_list
            }
            for d in date_range
        }
    # a stock may lack industry info early in its history; back-fill from later dates
    df_local = pd.DataFrame(industries).T.sort_index()
    df_local.columns = df_local.columns.map(str).str.slice(0, 6)
    ss_local = df_local.stack(level=-1)
    ss_local.index.names = ["date", "code"]
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index([
        "date", "code"
    ]).assign(group=ss_local).reset_index().set_index(["datetime",
                                                       "code"]).drop("date",
                                                                     axis=1))
    group = merged_data["group"].unstack().bfill().stack()
    merged_data["group"] = group
    return merged_data
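The final back-fill works by reshaping: unstack pivots codes into columns so bfill runs down the date axis, and stack restores the (date, code) MultiIndex. A small self-contained sketch of that pattern with hypothetical dates and codes:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.to_datetime(['2020-01-01', '2020-01-02']), ['000001', '000002']],
    names=['date', 'code'])
group = pd.Series([np.nan, 'Banks', 'Utilities', 'Banks'], index=idx)
# dates become rows and codes become columns, so bfill propagates the first
# known industry of each code backwards in time
filled = group.unstack().bfill().stack()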
Example #57
0
    def _plot_discrete(
        self,
        data: pd.Series,
        prop: str,
        lineages: Optional[Union[str, Sequence[str]]] = None,
        cluster_key: Optional[str] = None,
        same_plot: bool = True,
        title: Optional[Union[str, List[str]]] = None,
        **kwargs,
    ) -> None:
        """
        Plot the states for each uncovered lineage.

        Parameters
        ----------
        lineages
            Plot only these lineages. If `None`, plot all lineages.
        cluster_key
            Key from :paramref:`adata` ``.obs`` for plotting categorical observations.
        same_plot
            Whether to plot the lineages on the same plot or separately.
        title
            The title of the plot.
        %(basis)s
        **kwargs
            Keyword arguments for :func:`scvelo.pl.scatter`.

        Returns
        -------
        %(just_plots)s
        """

        if data is None:
            raise RuntimeError(
                f"Compute `.{prop}` first as `.{F.COMPUTE.fmt(prop)}()`.")
        if not is_categorical_dtype(data):
            raise TypeError(
                f"Expected property `.{prop}` to be categorical, found `{type(data).__name__!r}`."
            )
        if prop in (P.ABS_PROBS.s, P.TERM.s):
            colors = getattr(self, A.TERM_COLORS.v, None)
        elif prop == P.MACRO.v:
            colors = getattr(self, A.MACRO_COLORS.v, None)
        else:
            logg.debug("No colors found. Creating new ones")
            colors = _create_categorical_colors(len(data.cat.categories))
        colors = dict(zip(data.cat.categories, colors))

        # these are states per se, but the arg names are kept the same for dispatch
        if lineages is not None:
            if isinstance(lineages, str):
                lineages = [lineages]
            for state in lineages:
                if state not in data.cat.categories:
                    raise ValueError(
                        f"Invalid state `{state!r}`. Valid options are `{list(data.cat.categories)}`."
                    )
            data = data.copy()
            to_remove = list(set(data.cat.categories) - set(lineages))

            if len(to_remove) == len(data.cat.categories):
                raise RuntimeError(
                    "Nothing to plot because empty subset has been selected.")

            for state in to_remove:
                data[data == state] = np.nan
            data.cat.remove_categories(to_remove, inplace=True)

        if cluster_key is None:
            cluster_key = []
        elif isinstance(cluster_key, str):
            cluster_key = [cluster_key]
        if not isinstance(cluster_key, list):
            cluster_key = list(cluster_key)

        same_plot = same_plot or len(data.cat.categories) == 1
        kwargs["legend_loc"] = kwargs.get("legend_loc", "on data")

        with RandomKeys(self.adata,
                        None if same_plot else len(data.cat.categories),
                        where="obs") as keys:
            if same_plot:
                key = keys[0]
                self.adata.obs[key] = data
                self.adata.uns[f"{key}_colors"] = [
                    colors[c] for c in data.cat.categories
                ]

                if title is None:
                    title = (
                        f"{prop.replace('_', ' ')} "
                        f"({Direction.BACKWARD if self.kernel.backward else Direction.FORWARD})"
                    )
                if isinstance(title, str):
                    title = [title]

                scv.pl.scatter(
                    self.adata,
                    title=cluster_key + title,
                    color=cluster_key + keys,
                    **_filter_kwargs(scv.pl.scatter, **kwargs),
                )
            else:
                for key, cat in zip(keys, data.cat.categories):
                    d = data.copy()
                    d[data != cat] = None
                    d.cat.set_categories([cat], inplace=True)

                    self.adata.obs[key] = d
                    self.adata.uns[f"{key}_colors"] = [colors[cat]]

                scv.pl.scatter(
                    self.adata,
                    color=cluster_key + keys,
                    title=(cluster_key + [
                        f"{_initial if self.kernel.backward else _terminal} state {c}"
                        for c in data.cat.categories
                    ]) if title is None else title,
                    **_filter_kwargs(scv.pl.scatter, **kwargs),
                )
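The per-category loop at the end isolates one lineage at a time by masking every other value to NaN and shrinking the categorical to a single category. A standalone sketch of that masking pattern on a plain categorical Series (assumed data; plotting omitted):

import pandas as pd

data = pd.Series(['a', 'b', 'a', 'c'], dtype='category')
for cat in data.cat.categories:
    d = data.copy()
    d[data != cat] = None            # hide every other category as NaN
    d = d.cat.set_categories([cat])  # keep only the category being shown
    # d now holds `cat` where it occurred and NaN everywhere else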
Example #58
0
    def test_partial_setting(self):

        # GH2578, allow ix and friends to partially set

        # series
        s_orig = Series([1, 2, 3])

        s = s_orig.copy()
        s[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        # iloc/iat raise
        s = s_orig.copy()

        with pytest.raises(IndexError):
            s.iloc[3] = 5.

        with pytest.raises(IndexError):
            s.iat[3] = 5.

        # ## frame ##

        df_orig = DataFrame(np.arange(6).reshape(3, 2),
                            columns=['A', 'B'],
                            dtype='int64')

        # iloc/iat raise
        df = df_orig.copy()

        with pytest.raises(IndexError):
            df.iloc[4, 2] = 5.

        with pytest.raises(IndexError):
            df.iat[4, 2] = 5.

        # row setting where it exists
        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.iloc[1] = df.iloc[2]
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.loc[1] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # like 2578, partial setting with dtype preservation
        expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
        df = df_orig.copy()
        df.loc[3] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # single dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
        df = df_orig.copy()
        df['B'] = df['B'].astype(np.float64)
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # single dtype frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # GH 8473
        dates = date_range('1/1/2000', periods=8)
        df_orig = DataFrame(np.random.randn(8, 4),
                            index=dates,
                            columns=['A', 'B', 'C', 'D'])

        expected = pd.concat(
            [df_orig,
             DataFrame({'A': 7}, index=[dates[-1] + dates.freq])],
            sort=True)
        df = df_orig.copy()
        df.loc[dates[-1] + dates.freq, 'A'] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + dates.freq, 'A'] = 7
        tm.assert_frame_equal(df, expected)

        exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq])
        expected = pd.concat([df_orig, exp_other], axis=1)

        df = df_orig.copy()
        df.loc[dates[-1] + dates.freq, 0] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + dates.freq, 0] = 7
        tm.assert_frame_equal(df, expected)
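All of the Series cases above hinge on setting-with-enlargement: assigning to a missing label through [] or .loc appends it, while the positional accessors .iloc/.iat refuse to grow the object. A minimal sketch (pandas only):

import pandas as pd

s = pd.Series([1, 2, 3])
s.loc[5] = 5        # enlarges: index becomes [0, 1, 2, 5]
try:
    s.iloc[10] = 7  # positional setters never enlarge
except IndexError:
    pass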
Example #59
0
    def test_convert(self):
        # Tests: All to nans, coerce, true
        # Test coercion returns correct type
        s = Series(["a", "b", "c"])
        results = s._convert(datetime=True, coerce=True)
        expected = Series([NaT] * 3)
        tm.assert_series_equal(results, expected)

        results = s._convert(numeric=True, coerce=True)
        expected = Series([np.nan] * 3)
        tm.assert_series_equal(results, expected)

        expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]"))
        results = s._convert(timedelta=True, coerce=True)
        tm.assert_series_equal(results, expected)

        dt = datetime(2001, 1, 1, 0, 0)
        td = dt - datetime(2000, 1, 1, 0, 0)

        # Test coercion with mixed types
        s = Series(["a", "3.1415", dt, td])
        results = s._convert(datetime=True, coerce=True)
        expected = Series([NaT, NaT, dt, NaT])
        tm.assert_series_equal(results, expected)

        results = s._convert(numeric=True, coerce=True)
        expected = Series([np.nan, 3.1415, np.nan, np.nan])
        tm.assert_series_equal(results, expected)

        results = s._convert(timedelta=True, coerce=True)
        expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]"))
        tm.assert_series_equal(results, expected)

        # Test standard conversion returns original
        results = s._convert(datetime=True)
        tm.assert_series_equal(results, s)
        results = s._convert(numeric=True)
        expected = Series([np.nan, 3.1415, np.nan, np.nan])
        tm.assert_series_equal(results, expected)
        results = s._convert(timedelta=True)
        tm.assert_series_equal(results, s)

        # test pass-through and non-conversion when other types selected
        s = Series(["1.0", "2.0", "3.0"])
        results = s._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([1.0, 2.0, 3.0])
        tm.assert_series_equal(results, expected)
        results = s._convert(True, False, True)
        tm.assert_series_equal(results, s)

        s = Series([datetime(2001, 1, 1, 0, 0),
                    datetime(2001, 1, 1, 0, 0)],
                   dtype="O")
        results = s._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series(
            [datetime(2001, 1, 1, 0, 0),
             datetime(2001, 1, 1, 0, 0)])
        tm.assert_series_equal(results, expected)
        results = s._convert(datetime=False, numeric=True, timedelta=True)
        tm.assert_series_equal(results, s)

        td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
        s = Series([td, td], dtype="O")
        results = s._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([td, td])
        tm.assert_series_equal(results, expected)
        results = s._convert(True, True, False)
        tm.assert_series_equal(results, s)

        s = Series([1.0, 2, 3], index=["a", "b", "c"])
        result = s._convert(numeric=True)
        tm.assert_series_equal(result, s)

        # force numeric conversion
        r = s.copy().astype("O")
        r["a"] = "1"
        result = r._convert(numeric=True)
        tm.assert_series_equal(result, s)

        r = s.copy().astype("O")
        r["a"] = "1."
        result = r._convert(numeric=True)
        tm.assert_series_equal(result, s)

        r = s.copy().astype("O")
        r["a"] = "garbled"
        result = r._convert(numeric=True)
        expected = s.copy()
        expected["a"] = np.nan
        tm.assert_series_equal(result, expected)

        # GH 4119, not converting a mixed type (e.g. floats and object)
        s = Series([1, "na", 3, 4])
        result = s._convert(datetime=True, numeric=True)
        expected = Series([1, np.nan, 3, 4])
        tm.assert_series_equal(result, expected)

        s = Series([1, "", 3, 4])
        result = s._convert(datetime=True, numeric=True)
        tm.assert_series_equal(result, expected)

        # dates
        s = Series([
            datetime(2001, 1, 1, 0, 0),
            datetime(2001, 1, 2, 0, 0),
            datetime(2001, 1, 3, 0, 0),
        ])
        s2 = Series(
            [
                datetime(2001, 1, 1, 0, 0),
                datetime(2001, 1, 2, 0, 0),
                datetime(2001, 1, 3, 0, 0),
                "foo",
                1.0,
                1,
                Timestamp("20010104"),
                "20010105",
            ],
            dtype="O",
        )

        result = s._convert(datetime=True)
        expected = Series(
            [
                Timestamp("20010101"),
                Timestamp("20010102"),
                Timestamp("20010103")
            ],
            dtype="M8[ns]",
        )
        tm.assert_series_equal(result, expected)

        result = s._convert(datetime=True, coerce=True)
        tm.assert_series_equal(result, expected)

        expected = Series(
            [
                Timestamp("20010101"),
                Timestamp("20010102"),
                Timestamp("20010103"),
                NaT,
                NaT,
                NaT,
                Timestamp("20010104"),
                Timestamp("20010105"),
            ],
            dtype="M8[ns]",
        )
        result = s2._convert(datetime=True,
                             numeric=False,
                             timedelta=False,
                             coerce=True)
        tm.assert_series_equal(result, expected)
        result = s2._convert(datetime=True, coerce=True)
        tm.assert_series_equal(result, expected)

        s = Series(["foo", "bar", 1, 1.0], dtype="O")
        result = s._convert(datetime=True, coerce=True)
        expected = Series([NaT] * 2 + [Timestamp(1)] * 2)
        tm.assert_series_equal(result, expected)

        # preserve dtype if non-object
        s = Series([1], dtype="float32")
        result = s._convert(datetime=True, coerce=True)
        tm.assert_series_equal(result, s)

        # FIXME: don't leave commented-out
        # r = s.copy()
        # r[0] = np.nan
        # result = r._convert(convert_dates=True,convert_numeric=False)
        # assert result.dtype == 'M8[ns]'

        # dateutil parses some single letters into today's value as a date
        expected = Series([NaT])
        for x in "abcdefghijklmnopqrstuvwxyz":
            s = Series([x])
            result = s._convert(datetime=True, coerce=True)
            tm.assert_series_equal(result, expected)
            s = Series([x.upper()])
            result = s._convert(datetime=True, coerce=True)
            tm.assert_series_equal(result, expected)
    def test_convert(self):
        # GH#10265
        dt = datetime(2001, 1, 1, 0, 0)
        td = dt - datetime(2000, 1, 1, 0, 0)

        # Test coercion with mixed types
        ser = Series(["a", "3.1415", dt, td])

        results = ser._convert(numeric=True)
        expected = Series([np.nan, 3.1415, np.nan, np.nan])
        tm.assert_series_equal(results, expected)

        # Test standard conversion returns original
        results = ser._convert(datetime=True)
        tm.assert_series_equal(results, ser)
        results = ser._convert(numeric=True)
        expected = Series([np.nan, 3.1415, np.nan, np.nan])
        tm.assert_series_equal(results, expected)
        results = ser._convert(timedelta=True)
        tm.assert_series_equal(results, ser)

        # test pass-through and non-conversion when other types selected
        ser = Series(["1.0", "2.0", "3.0"])
        results = ser._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([1.0, 2.0, 3.0])
        tm.assert_series_equal(results, expected)
        results = ser._convert(True, False, True)
        tm.assert_series_equal(results, ser)

        ser = Series(
            [datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O"
        )
        results = ser._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)])
        tm.assert_series_equal(results, expected)
        results = ser._convert(datetime=False, numeric=True, timedelta=True)
        tm.assert_series_equal(results, ser)

        td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
        ser = Series([td, td], dtype="O")
        results = ser._convert(datetime=True, numeric=True, timedelta=True)
        expected = Series([td, td])
        tm.assert_series_equal(results, expected)
        results = ser._convert(True, True, False)
        tm.assert_series_equal(results, ser)

        ser = Series([1.0, 2, 3], index=["a", "b", "c"])
        result = ser._convert(numeric=True)
        tm.assert_series_equal(result, ser)

        # force numeric conversion
        res = ser.copy().astype("O")
        res["a"] = "1"
        result = res._convert(numeric=True)
        tm.assert_series_equal(result, ser)

        res = ser.copy().astype("O")
        res["a"] = "1."
        result = res._convert(numeric=True)
        tm.assert_series_equal(result, ser)

        res = ser.copy().astype("O")
        res["a"] = "garbled"
        result = res._convert(numeric=True)
        expected = ser.copy()
        expected["a"] = np.nan
        tm.assert_series_equal(result, expected)

        # GH 4119, not converting a mixed type (e.g. floats and object)
        ser = Series([1, "na", 3, 4])
        result = ser._convert(datetime=True, numeric=True)
        expected = Series([1, np.nan, 3, 4])
        tm.assert_series_equal(result, expected)

        ser = Series([1, "", 3, 4])
        result = ser._convert(datetime=True, numeric=True)
        tm.assert_series_equal(result, expected)

        # dates
        ser = Series(
            [
                datetime(2001, 1, 1, 0, 0),
                datetime(2001, 1, 2, 0, 0),
                datetime(2001, 1, 3, 0, 0),
            ]
        )

        result = ser._convert(datetime=True)
        expected = Series(
            [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")],
            dtype="M8[ns]",
        )
        tm.assert_series_equal(result, expected)

        result = ser._convert(datetime=True)
        tm.assert_series_equal(result, expected)

        # preserve dtype if non-object
        ser = Series([1], dtype="float32")
        result = ser._convert(datetime=True)
        tm.assert_series_equal(result, ser)
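Series._convert is a private helper; in user code the same coercions are normally spelled with the public converters, where errors='coerce' plays the role of coerce=True. A rough public-API sketch of the mixed-type case above (expected results noted as comments, assuming a current pandas):

import pandas as pd
from datetime import datetime

s = pd.Series(['a', '3.1415', datetime(2001, 1, 1)])
pd.to_numeric(s, errors='coerce')    # roughly [NaN, 3.1415, NaN]
pd.to_datetime(s, errors='coerce')   # roughly [NaT, NaT, 2001-01-01]
pd.to_timedelta(s, errors='coerce')  # roughly [NaT, NaT, NaT]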