def test_apply(self):
    """DataFrame.apply with a ufunc, an aggregator, axis=1, an invalid
    axis, and a categorical-producing function (GH9573)."""
    with np.errstate(all='ignore'):
        # ufunc: applied elementwise, column result matches direct call
        applied = self.frame.apply(np.sqrt)
        assert_series_equal(np.sqrt(self.frame['A']), applied['A'])

        # aggregator: reduces each column to a scalar
        applied = self.frame.apply(np.mean)
        self.assertEqual(applied['A'], np.mean(self.frame['A']))

        # axis=1 reduces each row; result keeps the frame's index object
        d = self.frame.index[0]
        applied = self.frame.apply(np.mean, axis=1)
        self.assertEqual(applied[d], np.mean(self.frame.xs(d)))
        self.assertIs(applied.index, self.frame.index)  # want this

    # invalid axis
    df = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
    self.assertRaises(ValueError, df.apply, lambda x: x, 2)

    # GH9573: apply returning categorical columns keeps shape and dtype
    df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
                    'c1': ['C', 'C', 'D', 'D']})
    df = df.apply(lambda ts: ts.astype('category'))
    self.assertEqual(df.shape, (4, 2))
    self.assertTrue(isinstance(df['c0'].dtype, CategoricalDtype))
    self.assertTrue(isinstance(df['c1'].dtype, CategoricalDtype))
def test_query_python(self):
    """query/eval with the pure-python engine match the stored fixtures."""
    frame = self.df

    queried = frame.query('A>0', engine='python')
    assert_frame_equal(queried, self.expected1)

    evaluated = frame.eval('A+1', engine='python')
    assert_series_equal(evaluated, self.expected2, check_names=False)
def test_sort_index_different_sortorder(self):
    """Mixed ascending/descending sort on columns, a MultiIndex, and a
    Series all agree with a manual np.lexsort ordering."""
    A = np.arange(20).repeat(5)
    B = np.tile(np.arange(5), 20)

    # shuffle so the sort actually has work to do
    indexer = np.random.permutation(100)
    A = A.take(indexer)
    B = B.take(indexer)
    df = DataFrame({'A': A, 'B': B, 'C': np.random.randn(100)})

    # use .sort_values #9816 (sort_index(by=...) is deprecated)
    with tm.assert_produces_warning(FutureWarning):
        df.sort_index(by=['A', 'B'], ascending=[1, 0])
    result = df.sort_values(by=['A', 'B'], ascending=[1, 0])

    # lexsort keys: negate B (via max - B) to get descending order on B
    ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
    expected = df.take(ex_indexer)
    assert_frame_equal(result, expected)

    # test with multiindex, too
    idf = df.set_index(['A', 'B'])

    result = idf.sort_index(ascending=[1, 0])
    expected = idf.take(ex_indexer)
    assert_frame_equal(result, expected)

    # also, Series!
    result = idf['C'].sort_index(ascending=[1, 0])
    assert_series_equal(result, expected['C'])
def test_secondary_y(self):
    """secondary_y plotting attaches left_ax/right_ax markers and moves
    tick labels to the right-hand axis."""
    import matplotlib.pyplot as plt

    ser = Series(np.random.randn(10))
    ser2 = Series(np.random.randn(10))

    # plotting on the secondary axis returns the secondary axes object,
    # which carries a reference back to the primary via ``left_ax``
    ax = ser.plot(secondary_y=True)
    self.assertTrue(hasattr(ax, 'left_ax'))
    self.assertFalse(hasattr(ax, 'right_ax'))
    fig = ax.get_figure()
    axes = fig.get_axes()
    l = ax.get_lines()[0]
    # the plotted data round-trips through the line artist
    xp = Series(l.get_ydata(), l.get_xdata())
    assert_series_equal(ser, xp)
    self.assertEqual(ax.get_yaxis().get_ticks_position(), 'right')
    # primary y-axis is hidden when only the secondary one is used
    self.assertFalse(axes[0].get_yaxis().get_visible())
    plt.close(fig)

    # a plain plot keeps the default tick position
    ax2 = ser2.plot()
    self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default')
    plt.close(ax2.get_figure())

    # primary then secondary on the same figure: primary stays visible
    # and gains ``right_ax``; secondary gains ``left_ax``
    ax = ser2.plot()
    ax2 = ser.plot(secondary_y=True)
    self.assertTrue(ax.get_yaxis().get_visible())
    self.assertFalse(hasattr(ax, 'left_ax'))
    self.assertTrue(hasattr(ax, 'right_ax'))
    self.assertTrue(hasattr(ax2, 'left_ax'))
    self.assertFalse(hasattr(ax2, 'right_ax'))
def test_astype_with_tz(self):
    """astype on tz-aware datetime data: to naive ns, to str, and
    between timezones (GH 18951, BUG#10442)."""
    # with tz: astype to naive datetime64 converts through UTC
    rng = date_range('1/1/2000', periods=10, tz='US/Eastern')
    result = rng.astype('datetime64[ns]')
    expected = (date_range('1/1/2000', periods=10, tz='US/Eastern')
                .tz_convert('UTC').tz_localize(None))
    tm.assert_index_equal(result, expected)

    # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex
    result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str)
    expected = pd.Series(
        ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object)
    tm.assert_series_equal(result, expected)

    # tz-aware values stringify with their UTC offset
    result = Series(pd.date_range('2012-01-01', periods=3,
                                  tz='US/Eastern')).astype(str)
    expected = Series(['2012-01-01 00:00:00-05:00',
                       '2012-01-02 00:00:00-05:00',
                       '2012-01-03 00:00:00-05:00'],
                      dtype=object)
    tm.assert_series_equal(result, expected)

    # GH 18951: tz-aware to tz-aware converts the wall time
    idx = date_range('20170101', periods=4, tz='US/Pacific')
    result = idx.astype('datetime64[ns, US/Eastern]')
    expected = date_range('20170101 03:00:00', periods=4, tz='US/Eastern')
    tm.assert_index_equal(result, expected)

    # GH 18951: tz-naive to tz-aware localizes (wall time unchanged)
    idx = date_range('20170101', periods=4)
    result = idx.astype('datetime64[ns, US/Eastern]')
    expected = date_range('20170101', periods=4, tz='US/Eastern')
    tm.assert_index_equal(result, expected)
def test_timegrouper_with_reg_groups_freq(self, freq):
    """GH 6764: grouping by [Grouper(freq), column] matches the
    groupby-then-resample equivalent, with or without a sorted index."""
    df = DataFrame({
        'date': pd.to_datetime([
            '20121002', '20121007', '20130130', '20130202', '20130305',
            '20121002', '20121207', '20130130', '20130202', '20130305',
            '20130202', '20130305'
        ]),
        'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
        'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                       359, 801],
        'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
    }).set_index('date')

    # reference answer built the long way round:
    # group by user, resample per user, then re-shape the index so it is
    # comparable to the Grouper-based result
    expected = (
        df.groupby('user_id')['whole_cost']
        .resample(freq)
        .sum(min_count=1)  # XXX min_count keeps empty bins as NaN so
                           # dropna() below can remove them
        .dropna()
        .reorder_levels(['date', 'user_id'])
        .sort_index()
        .astype('int64')
    )
    expected.name = 'whole_cost'

    # sorted index
    result1 = df.sort_index().groupby(
        [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
    assert_series_equal(result1, expected)

    # unsorted index must give the same answer
    result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
        'whole_cost'].sum()
    assert_series_equal(result2, expected)
def test_groupby_max_datetime64(self): # GH 5869 # datetimelike dtype conversion from int df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) expected = df.groupby('A')['A'].apply(lambda x: x.max()) result = df.groupby('A')['A'].max() assert_series_equal(result, expected)
def test_setitem_ndarray_1d(self):
    """GH5508: assigning a 1d ndarray through .loc must match the
    indexer length; a mismatch raises ValueError, a match assigns and
    may upcast the column to complex.

    Fixes: ``np.complex`` (a deprecated alias of the builtin
    ``complex``) was removed in NumPy 1.24 — use ``np.complex128``,
    which is the dtype the alias resolved to.  The py2-compat ``lrange``
    helper is replaced by the equivalent ``list(range(...))``.
    """
    df = DataFrame(index=Index(list(range(1, 11))))
    df['foo'] = np.zeros(10, dtype=np.float64)
    df['bar'] = np.zeros(10, dtype=np.complex128)

    # invalid: 3 rows selected but 4 values supplied
    def f():
        df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j,
                                                 2.2, 1.0])

    pytest.raises(ValueError, f)

    # valid: 4 rows, 4 values
    df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j,
                                             2.2, 1.0])

    result = df.loc[df.index[2:6], 'bar']
    expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0],
                      index=[3, 4, 5, 6], name='bar')
    tm.assert_series_equal(result, expected)

    # dtype getting changed?
    df = DataFrame(index=Index(list(range(1, 11))))
    df['foo'] = np.zeros(10, dtype=np.float64)
    df['bar'] = np.zeros(10, dtype=np.complex128)

    # positional slice selects 3 rows but only 3 values are supplied for
    # a 2-column frame -> shape mismatch
    def f():
        df[2:5] = np.arange(1, 4) * 1j

    pytest.raises(ValueError, f)
def test_coercion_with_loc_and_series(self):
    """Masked .loc assignment of None coerces each starting dtype to the
    outcome recorded in EXPECTED_RESULTS."""
    for initial, expected in self.EXPECTED_RESULTS:
        series = Series(initial)
        # blank out every entry equal to the first element
        series.loc[series == series[0]] = None
        tm.assert_series_equal(series, Series(expected))
def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): # GH #19686 # .loc should work with nested indexers which can be # any list-like objects (see `pandas.api.types.is_list_like`) or slices def convert_nested_indexer(indexer_type, keys): if indexer_type == np.ndarray: return np.array(keys) if indexer_type == slice: return slice(*keys) return indexer_type(keys) a = [10, 20, 30] b = [1, 2, 3] index = pd.MultiIndex.from_product([a, b]) df = pd.DataFrame( np.arange(len(index), dtype='int64'), index=index, columns=['Data']) keys = ([10, 20], [2, 3]) types = (indexer_type_1, indexer_type_2) # check indexers with all the combinations of nested objects # of all the valid types indexer = tuple( convert_nested_indexer(indexer_type, k) for indexer_type, k in zip(types, keys)) result = df.loc[indexer, 'Data'] expected = pd.Series( [1, 2, 4, 5], name='Data', index=pd.MultiIndex.from_product(keys)) tm.assert_series_equal(result, expected)
def test_crosstab_margins(self):
    """crosstab(margins=True) adds an 'All' row/column whose totals
    match the corresponding groupby sizes.

    Fixes: ``.ix`` was deprecated in pandas 0.20 and removed in 1.0;
    label-based lookup is ``.loc``.
    """
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)

    df = DataFrame({'a': a, 'b': b, 'c': c})

    result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
                      margins=True)

    self.assertEqual(result.index.names, ('a',))
    self.assertEqual(result.columns.names, ['b', 'c'])

    # 'All' column equals the per-'a' group sizes plus the grand total
    all_cols = result['All', '']
    exp_cols = df.groupby(['a']).size().astype('i8')
    exp_cols = exp_cols.append(Series([len(df)], index=['All']))

    tm.assert_series_equal(all_cols, exp_cols)

    # 'All' row equals the per-(b, c) group sizes plus the grand total;
    # .loc replaces the removed .ix for label-based row selection
    all_rows = result.loc['All']
    exp_rows = df.groupby(['b', 'c']).size().astype('i8')
    exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))

    # align with the crosstab's column order; absent combinations are 0
    exp_rows = exp_rows.reindex(all_rows.index)
    exp_rows = exp_rows.fillna(0).astype(np.int64)
    tm.assert_series_equal(all_rows, exp_rows)
def test_constructor_numpy_scalar(self): # GH 19342 # construction with a numpy scalar # should not raise result = Series(np.array(100), index=np.arange(4), dtype='int64') expected = Series(100, index=np.arange(4), dtype='int64') tm.assert_series_equal(result, expected)
def test_constructor_dict_of_tuples(self): data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected)
def test_constructor_mixed_tz(self): s = Series([Timestamp('20130101'), Timestamp('20130101', tz='US/Eastern')]) expected = Series([Timestamp('20130101'), Timestamp('20130101', tz='US/Eastern')], dtype='object') assert_series_equal(s, expected)
def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) s1 = Series([4, 7, -5, 3], index=index1) s2 = Series(s1, index=index2) assert_series_equal(s2, s1.sort_index())
def test_make_choices_real_probs(random_seed, utilities):
    """With the seeded RNG, choices drawn from utility-derived
    probabilities land on the known alternatives."""
    probabilities = mnl.utils_to_probs(utilities)
    result = mnl.make_choices(probabilities)
    pdt.assert_series_equal(result, pd.Series([1, 2], index=[0, 1]))
def test_groupby_to_scalar_to_series_1(self):
    """groupby_to_scalar_to_series matches a plain groupby().apply(max)."""
    values = pd.Series([1, 2, 3, 4])
    labels = ['a', 'a', 'b', 'b']
    expected = values.groupby(labels).apply(max)
    result = pandas_easy.groupby_to_scalar_to_series(
        values, max, 1, by=labels)
    assert_series_equal(result, expected)
def test_dirint_min_cos_zenith_max_zenith():
    """Map out dirint behavior under difficult (near-horizon) conditions
    with various limiting kwargs settings."""
    # times don't have any physical relevance
    times = pd.DatetimeIndex(['2014-06-24T12-0700', '2014-06-24T18-0700'])
    ghi = pd.Series([0, 1], index=times)
    solar_zenith = pd.Series([90, 89.99], index=times)

    # defaults clamp everything to zero at these zeniths
    out = irradiance.dirint(ghi, solar_zenith, times)
    expected = pd.Series([0.0, 0.0], index=times, name='dni')
    assert_series_equal(out, expected)

    out = irradiance.dirint(ghi, solar_zenith, times, min_cos_zenith=0)
    expected = pd.Series([0.0, 0.0], index=times, name='dni')
    assert_series_equal(out, expected)

    out = irradiance.dirint(ghi, solar_zenith, times, max_zenith=90)
    expected = pd.Series([0.0, 0.0], index=times, name='dni')
    assert_series_equal(out, expected, check_less_precise=True)

    # only relaxing both limits lets the second point produce dni
    out = irradiance.dirint(ghi, solar_zenith, times, min_cos_zenith=0,
                            max_zenith=90)
    expected = pd.Series([0.0, 144.264507], index=times, name='dni')
    assert_series_equal(out, expected, check_less_precise=True)

    out = irradiance.dirint(ghi, solar_zenith, times, min_cos_zenith=0,
                            max_zenith=100)
    expected = pd.Series([0.0, 144.264507], index=times, name='dni')
    assert_series_equal(out, expected, check_less_precise=True)
def test_clearness_index_zenith_independent(airmass_kt):
    """clearness_index_zenith_independent over 2-d arrays, scalars, and
    Series, including the max_clearness_index clamp.

    Fixes: ``pd.DatetimeIndex(start=..., periods=..., freq=...)`` was
    deprecated in pandas 0.24 and subsequently removed; ``pd.date_range``
    is the supported constructor and produces an identical index.
    """
    clearness_index = np.array([-1, 0, .1, 1])
    # broadcast every kt against every airmass value
    clearness_index, airmass_kt = np.meshgrid(clearness_index, airmass_kt)
    out = irradiance.clearness_index_zenith_independent(clearness_index,
                                                        airmass_kt)
    expected = np.array(
        [[0., 0., 0.1, 1.],
         [0., 0., 0.138, 1.383],
         [0., 0., 0.182, 1.822],
         [0., 0., 0.212, 2.]])
    assert_allclose(out, expected, atol=0.001)

    # test max_clearness_index
    out = irradiance.clearness_index_zenith_independent(
        clearness_index, airmass_kt, max_clearness_index=0.82)
    expected = np.array(
        [[0., 0., 0.1, 0.82],
         [0., 0., 0.138, 0.82],
         [0., 0., 0.182, 0.82],
         [0., 0., 0.212, 0.82]])
    assert_allclose(out, expected, atol=0.001)

    # scalars
    out = irradiance.clearness_index_zenith_independent(.4, 2)
    expected = 0.443
    assert_allclose(out, expected, atol=0.001)

    # series; NaN airmass propagates to the output
    times = pd.date_range(start='20180601', periods=2, freq='12H')
    clearness_index = pd.Series([0, .5], index=times)
    airmass = pd.Series([np.nan, 2], index=times)
    out = irradiance.clearness_index_zenith_independent(clearness_index,
                                                        airmass)
    expected = pd.Series([np.nan, 0.553744437562], index=times)
    assert_series_equal(out, expected)
def test_quarterly_resampling(self):
    """Annual resampling of a quarterly PeriodIndex series matches the
    to_timestamp / to_period round-trip route."""
    rng = period_range('2000Q1', periods=10, freq='Q-DEC')
    ts = Series(np.arange(10), index=rng)

    exp = ts.to_timestamp().resample('A').to_period()
    result = ts.resample('A')
    assert_series_equal(result, exp)
def test_perez_components(irrad_data, ephem_data, dni_et, relative_airmass):
    """perez(return_components=True): the component columns sum to the
    sky_diffuse column, NaN input row included."""
    dni = irrad_data['dni'].copy()
    # inject a NaN to check propagation through the components
    dni.iloc[2] = np.nan

    out = irradiance.perez(40, 180, irrad_data['dhi'], dni, dni_et,
                           ephem_data['apparent_zenith'],
                           ephem_data['azimuth'], relative_airmass,
                           return_components=True)

    expected = pd.DataFrame(np.array(
        [[0., 31.46046871, np.nan, 45.45539877],
         [0., 26.84138589, np.nan, 31.72696071],
         [0., 0., np.nan, 4.47966439],
         [0., 4.62212181, np.nan, 9.25316454]]).T,
        columns=['sky_diffuse', 'isotropic', 'circumsolar', 'horizon'],
        index=irrad_data.index
    )

    # NOTE(review): presumably accounts for the pandas 0.22 change in
    # how sum() treats NaN (skipped -> 0) — confirm against whatsnew
    if pandas_0_22():
        expected_for_sum = expected['sky_diffuse'].copy()
        expected_for_sum.iloc[2] = 0
    else:
        expected_for_sum = expected['sky_diffuse']

    # first column is sky_diffuse itself; sum the remaining components
    sum_components = out.iloc[:, 1:].sum(axis=1)
    sum_components.name = 'sky_diffuse'

    assert_frame_equal(out, expected, check_less_precise=2)
    assert_series_equal(sum_components, expected_for_sum,
                        check_less_precise=2)
def test_resample_to_quarterly(self):
    """Annual-period series forward-fill down to quarterly frequency,
    for every anchor month and both conventions."""
    for month in MONTHS:
        ts = _simple_pts('1990', '1992', freq='A-%s' % month)
        quar_ts = ts.resample('Q-%s' % month, fill_method='ffill')

        # rebuild the expected result through timestamps: reindex the
        # start-of-year stamps onto quarter-end dates with ffill
        stamps = ts.to_timestamp('D', how='start')
        qdates = period_range(ts.index[0].asfreq('D', 'start'),
                              ts.index[-1].asfreq('D', 'end'),
                              freq='Q-%s' % month)

        expected = stamps.reindex(qdates.to_timestamp('D', 'e'),
                                  method='ffill')
        expected.index = qdates

        assert_series_equal(quar_ts, expected)

    # conforms, but different month
    ts = _simple_pts('1990', '1992', freq='A-JUN')

    for how in ['start', 'end']:
        result = ts.resample('Q-MAR', convention=how, fill_method='ffill')
        expected = ts.asfreq('Q-MAR', how=how)
        expected = expected.reindex(result.index, method='ffill')

        # .to_timestamp('D')
        # expected = expected.resample('Q-MAR', fill_method='ffill')

        assert_series_equal(result, expected)
def test_numpy_reduction(self):
    """resample(how='prod') agrees with a per-year groupby aggregation."""
    result = self.ts.resample('A', how='prod', closed='right')

    expected = self.ts.groupby(lambda stamp: stamp.year).agg(np.prod)
    # align the integer-year index with the resampler's period labels
    expected.index = result.index

    assert_series_equal(result, expected)
def test_astype_unicode(self):
    """GH7758: astype('unicode') matches mapping ``compat.text_type``.

    On Python 2 the default encoding is temporarily forced to utf-8 so
    that already-encoded byte strings decode correctly, and restored
    afterwards.
    """
    digits = string.digits
    test_series = [
        Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
        Series([u('データーサイエンス、お前はもう死んでいる')]),
    ]

    former_encoding = None
    if not compat.PY3:
        # in python we can force the default encoding for this test
        former_encoding = sys.getdefaultencoding()
        reload(sys)  # noqa
        sys.setdefaultencoding("utf-8")
    if sys.getdefaultencoding() == "utf-8":
        # utf-8 bytes only decode cleanly once the default encoding is set
        test_series.append(Series([u('野菜食べないとやばい')
                                   .encode("utf-8")]))

    for s in test_series:
        res = s.astype("unicode")
        expec = s.map(compat.text_type)
        assert_series_equal(res, expec)

    # restore the former encoding
    if former_encoding is not None and former_encoding != "utf-8":
        reload(sys)  # noqa
        sys.setdefaultencoding(former_encoding)
def test_conversions(data_missing):
    """astype(object) preserves values, NA, and scalar types exactly."""
    df = pd.DataFrame({'A': data_missing})

    # astype to object series
    result = df['A'].astype('object')
    expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(result, expected)

    # convert to object ndarray; we assert exact equality, including the
    # type conversions applied to the scalars
    result = df['A'].astype('object').values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for actual, wanted in zip(result, expected):
        if pd.isnull(actual):
            assert pd.isnull(wanted)
        elif is_integer(actual):
            # PY2 can be int or long
            assert actual == wanted
            assert is_integer(wanted)
        else:
            assert actual == wanted
            assert type(actual) == type(wanted)
def assert_series_equal(left, right, check_names=True, **kwargs):
    """Backwards compatibility wrapper for
    ``pandas.util.testing.assert_series_equal``

    Examples
    --------
    >>> import pandas as pd
    >>> s = pd.Series(list('abc'), name='a')
    >>> s2 = pd.Series(list('abc'), name='b')
    >>> assert_series_equal(s, s2)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
        ...
    AssertionError: ...
    >>> assert_series_equal(s, s2, check_names=False)

    See Also
    --------
    pandas.util.testing.assert_series_equal
    """
    try:
        return tm.assert_series_equal(left, right, check_names=check_names,
                                      **kwargs)
    except TypeError:
        # older pandas does not accept ``check_names``; emulate it by
        # comparing names manually, then delegating without the kwarg
        if check_names:
            assert left.name == right.name
        return tm.assert_series_equal(left, right, **kwargs)
def test_duplicate_dates_indexing(dups):
    """Indexing a series with duplicate dates: duplicated labels return
    sub-series, unique labels return scalars; setting works for both."""
    ts = dups

    uniques = ts.index.unique()
    for date in uniques:
        result = ts[date]

        mask = ts.index == date
        total = (ts.index == date).sum()
        expected = ts[mask]
        if total > 1:
            # duplicated label -> a slice of all matching rows
            assert_series_equal(result, expected)
        else:
            # unique label -> a scalar
            assert_almost_equal(result, expected[0])

    # NOTE: intentionally reuses ``date``/``mask`` from the final loop
    # iteration above
    cp = ts.copy()
    cp[date] = 0
    expected = Series(np.where(mask, 0, ts), index=ts.index)
    assert_series_equal(cp, expected)

    pytest.raises(KeyError, ts.__getitem__, datetime(2000, 1, 6))

    # new index: assignment to a missing date appends it
    ts[datetime(2000, 1, 6)] = 0
    assert ts[datetime(2000, 1, 6)] == 0
def test_upsample_with_limit(self):
    """Forward-fill upsampling honours the ``limit`` argument."""
    rng = date_range('1/1/2000', periods=3, freq='5t')
    ts = Series(np.random.randn(len(rng)), rng)

    upsampled = ts.resample('t', fill_method='ffill', limit=2)
    # reindexing with the same method/limit is the reference behavior
    expected = ts.reindex(upsampled.index, method='ffill', limit=2)
    assert_series_equal(upsampled, expected)
def test_combineFrame(self, float_frame, mixed_float_frame,
                      mixed_int_frame):
    """Frame + frame arithmetic: index/column alignment, NaN
    propagation, empty-frame corners, and mixed-dtype upcasting."""
    # half the rows, one column dropped, some NaNs injected
    frame_copy = float_frame.reindex(float_frame.index[::2])
    del frame_copy['D']
    frame_copy['C'][:5] = np.nan

    added = float_frame + frame_copy

    # rows present in both frames are doubled; the rest become NaN
    indexer = added['A'].dropna().index
    exp = (float_frame['A'] * 2).copy()

    tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer])

    exp.loc[~exp.index.isin(indexer)] = np.nan
    tm.assert_series_equal(added['A'], exp.loc[added['A'].index])

    assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()

    # assert(False)

    # column 'D' only exists in one operand -> all NaN
    assert np.isnan(added['D']).all()

    self_added = float_frame + float_frame
    tm.assert_index_equal(self_added.index, float_frame.index)

    # addition is symmetric in its NaN pattern
    added_rev = frame_copy + float_frame
    assert np.isnan(added['D']).all()
    assert np.isnan(added_rev['D']).all()

    # corner cases

    # empty
    plus_empty = float_frame + DataFrame()
    assert np.isnan(plus_empty.values).all()

    empty_plus = DataFrame() + float_frame
    assert np.isnan(empty_plus.values).all()

    empty_empty = DataFrame() + DataFrame()
    assert empty_empty.empty

    # out of order: columns align by label before the op
    reverse = float_frame.reindex(columns=float_frame.columns[::-1])

    assert_frame_equal(reverse + float_frame, float_frame * 2)

    # mix vs float64, upcast
    added = float_frame + mixed_float_frame
    _check_mixed_float(added, dtype='float64')
    added = mixed_float_frame + float_frame
    _check_mixed_float(added, dtype='float64')

    # mix vs mix
    added = mixed_float_frame + mixed_float_frame
    _check_mixed_float(added, dtype=dict(C=None))

    # with int
    added = float_frame + mixed_int_frame
    _check_mixed_float(added, dtype='float64')
def test_resample_5minute(self):
    """Resampling a minutely PeriodIndex series to 5 minutes matches the
    timestamp-based route.

    Fixes: ``TimeSeries`` was a long-deprecated alias of ``Series``
    (removed from pandas); construct a plain ``Series`` instead, which
    is behaviorally identical.
    """
    rng = period_range('1/1/2000', '1/5/2000', freq='T')
    ts = Series(np.random.randn(len(rng)), index=rng)

    result = ts.resample('5min')
    expected = ts.to_timestamp().resample('5min')
    assert_series_equal(result, expected)
def test_Component_conversion(self):
    """Component.to_series round-trips its data as a pandas Series."""
    component = Component(np.arange(5))
    expected = pd.Series(np.arange(5))
    assert_series_equal(expected, component.to_series())
def test_pickle_strings(self, string_series):
    """A string Series survives a pickle round-trip unchanged."""
    restored = self._pickle_roundtrip(string_series)
    tm.assert_series_equal(restored, string_series)
def test_pickle_datetimes(self, datetime_series):
    """A datetime Series survives a pickle round-trip unchanged."""
    restored = self._pickle_roundtrip(datetime_series)
    tm.assert_series_equal(restored, datetime_series)
def test_percentiles():
    """percentiles() for empty inputs, integer and date-rule windows,
    ramps, and the historical (self-benchmarked) default."""
    dates = [
        date(2019, 1, 1),
        date(2019, 1, 2),
        date(2019, 1, 3),
        date(2019, 1, 4),
        date(2019, 1, 7),
        date(2019, 1, 8),
    ]
    x = pd.Series([3.0, 2.0, 3.0, 1.0, 3.0, 6.0], index=dates)
    y = pd.Series([3.5, 1.8, 2.9, 1.2, 3.1, 6.0], index=dates)

    # empty inputs / window longer than the series yield empty results
    assert_series_equal(percentiles(pd.Series([]), y), pd.Series([]))
    assert_series_equal(percentiles(x, pd.Series([])), pd.Series())
    assert_series_equal(percentiles(x, y, Window(7, 0)), pd.Series())

    result = percentiles(x, y, 2)
    expected = pd.Series([50.0, 50.0, 100.0, 75.0], index=dates[2:])
    assert_series_equal(result, expected,
                        obj="percentiles with window length 2")

    # ramp 0 keeps the partial-window leading values
    result = percentiles(x, y, Window(2, 0))
    expected = pd.Series([100.0, 0.0, 50.0, 50.0, 100.0, 75.0],
                         index=dates)
    assert_series_equal(result, expected,
                        obj="percentiles with window 2 and ramp 0")

    result = percentiles(x, y, Window('1w', 0))
    expected = pd.Series([100.0, 0.0, 33.333333, 25.0, 100.0, 90.0],
                         index=dates)
    assert_series_equal(result, expected, obj="percentiles with window 1w")

    # a 3d ramp drops the first three dates
    result = percentiles(x, y, Window('1w', '3d'))
    expected = pd.Series([25.0, 100.0, 90.0], index=dates[3:])
    assert_series_equal(result, expected,
                        obj="percentiles with window 1w and ramp 3d")

    # no benchmark series: percentiles over x's own history
    result = percentiles(x)
    expected = pd.Series([50.0, 25.0, 66.667, 12.500, 70.0, 91.667],
                         index=dates)
    assert_series_equal(result, expected,
                        obj="percentiles over historical values",
                        check_less_precise=True)

    result = percentiles(x, y)
    expected = pd.Series([100.0, 0.0, 33.333, 25.0, 100.0, 91.667],
                         index=dates)
    assert_series_equal(result, expected,
                        obj="percentiles without window length",
                        check_less_precise=True)

    with pytest.raises(ValueError):
        percentiles(x, pd.Series(), Window(6, 1))
def test_zscores():
    """zscores() for empty/singleton inputs, the full-sample default,
    and integer / date-rule rolling windows."""
    # empty and singleton inputs
    assert_series_equal(zscores(pd.Series()), pd.Series())
    assert_series_equal(zscores(pd.Series(), 1), pd.Series())
    assert_series_equal(zscores(pd.Series([1])), pd.Series([0.0]))
    assert_series_equal(zscores(pd.Series([1]), Window(1, 0)),
                        pd.Series([0.0]))

    dates = [
        date(2019, 1, 1),
        date(2019, 1, 2),
        date(2019, 1, 3),
        date(2019, 1, 4),
        date(2019, 1, 7),
        date(2019, 1, 8),
    ]
    x = pd.Series([3.0, 2.0, 3.0, 1.0, 3.0, 6.0], index=dates)

    result = zscores(x)
    expected = pd.Series([0.000000, -0.597614, 0.000000, -1.195229,
                          0.000000, 1.792843], index=dates)
    assert_series_equal(result, expected, obj="z-score",
                        check_less_precise=True)
    # full-sample z-score matches the direct (x - mean) / std formula
    assert_series_equal(result, (x - x.mean()) / x.std(),
                        obj="full series zscore")

    result = zscores(x, Window(2, 0))
    expected = pd.Series([0.0, -0.707107, 0.707107, -0.707107, 0.707107,
                          0.707107], index=dates)
    assert_series_equal(result, expected, obj="z-score window 2",
                        check_less_precise=True)
    # a Window with full ramp equals the plain integer window
    assert_series_equal(zscores(x, Window(5, 5)), zscores(x, 5))

    result = zscores(x, Window('1w', 0))
    expected = pd.Series([0.0, -0.707106, 0.577350, -1.305582, 0.670820,
                          1.603567], index=dates)
    assert_series_equal(result, expected, obj="z-score window 1w",
                        check_less_precise=True)
def test_CoordinateComponent_conversion(self):
    """A pixel coordinate component converts to the 0..n-1 Series."""
    d = Data(x=[1, 2, 3])
    expected = pd.Series(np.array([0, 1, 2]))
    component = d.get_component(d.pixel_component_ids[0])
    assert_series_equal(expected, component.to_series())
def test_CategoricalComponent_conversion(self):
    """A categorical component converts to a plain label Series."""
    component = CategoricalComponent(np.array(['a', 'b', 'c', 'd']))
    expected = pd.Series(['a', 'b', 'c', 'd'])
    assert_series_equal(expected, component.to_series())
def test_ops_series_object(self):
    """GH 13043: Timedelta arithmetic on object-dtype Series holding
    tz-aware Timestamps, and on object-dtype Series of Timedeltas."""
    s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'),
                   pd.Timestamp('2015-01-01', tz='Asia/Tokyo')],
                  name='xxx')
    # mixed timezones force object dtype
    self.assertEqual(s.dtype, object)

    exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'),
                     pd.Timestamp('2015-01-02', tz='Asia/Tokyo')],
                    name='xxx')
    # addition works elementwise in both operand orders
    tm.assert_series_equal(s + pd.Timedelta('1 days'), exp)
    tm.assert_series_equal(pd.Timedelta('1 days') + s, exp)

    # object series & object series
    s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'),
                    pd.Timestamp('2015-01-05', tz='Asia/Tokyo')],
                   name='xxx')
    self.assertEqual(s2.dtype, object)
    exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')],
                    name='xxx')
    tm.assert_series_equal(s2 - s, exp)
    tm.assert_series_equal(s - s2, -exp)

    # object-dtype Series of Timedeltas behaves like a timedelta series
    s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')],
                  name='xxx', dtype=object)
    self.assertEqual(s.dtype, object)

    exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')],
                    name='xxx')
    tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp)
    tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp)
def test_timedelta_ops_with_missing_values(self):
    """Timedelta arithmetic where one operand is NaT or np.nan, across
    all combinations of scalar, Series, and DataFrame operands."""
    # setup
    s1 = pd.to_timedelta(Series(['00:00:01']))
    s2 = pd.to_timedelta(Series(['00:00:02']))
    sn = pd.to_timedelta(Series([pd.NaT]))
    df1 = DataFrame(['00:00:01']).apply(pd.to_timedelta)
    df2 = DataFrame(['00:00:02']).apply(pd.to_timedelta)
    dfn = DataFrame([pd.NaT]).apply(pd.to_timedelta)
    scalar1 = pd.to_timedelta('00:00:01')
    scalar2 = pd.to_timedelta('00:00:02')
    timedelta_NaT = pd.to_timedelta('NaT')
    NA = np.nan

    # scalar / scalar
    actual = scalar1 + scalar1
    self.assertEqual(actual, scalar2)
    actual = scalar2 - scalar1
    self.assertEqual(actual, scalar1)

    # Series / Series
    actual = s1 + s1
    assert_series_equal(actual, s2)
    actual = s2 - s1
    assert_series_equal(actual, s1)

    # Series / scalar, both operand orders
    actual = s1 + scalar1
    assert_series_equal(actual, s2)
    actual = scalar1 + s1
    assert_series_equal(actual, s2)
    actual = s2 - scalar1
    assert_series_equal(actual, s1)
    actual = -scalar1 + s2
    assert_series_equal(actual, s1)

    # timedelta NaT propagates to NaT
    actual = s1 + timedelta_NaT
    assert_series_equal(actual, sn)
    actual = timedelta_NaT + s1
    assert_series_equal(actual, sn)
    actual = s1 - timedelta_NaT
    assert_series_equal(actual, sn)
    actual = -timedelta_NaT + s1
    assert_series_equal(actual, sn)

    # np.nan is treated like NaT here (legacy behavior)
    actual = s1 + NA
    assert_series_equal(actual, sn)
    actual = NA + s1
    assert_series_equal(actual, sn)
    actual = s1 - NA
    assert_series_equal(actual, sn)
    actual = -NA + s1
    assert_series_equal(actual, sn)

    actual = s1 + pd.NaT
    assert_series_equal(actual, sn)
    actual = s2 - pd.NaT
    assert_series_equal(actual, sn)

    # Series / DataFrame combinations
    actual = s1 + df1
    assert_frame_equal(actual, df2)
    actual = s2 - df1
    assert_frame_equal(actual, df1)
    actual = df1 + s1
    assert_frame_equal(actual, df2)
    actual = df2 - s1
    assert_frame_equal(actual, df1)

    # DataFrame / DataFrame and DataFrame / scalar
    actual = df1 + df1
    assert_frame_equal(actual, df2)
    actual = df2 - df1
    assert_frame_equal(actual, df1)
    actual = df1 + scalar1
    assert_frame_equal(actual, df2)
    actual = df2 - scalar1
    assert_frame_equal(actual, df1)

    actual = df1 + timedelta_NaT
    assert_frame_equal(actual, dfn)
    actual = df1 - timedelta_NaT
    assert_frame_equal(actual, dfn)

    actual = df1 + NA
    assert_frame_equal(actual, dfn)
    actual = df1 - NA
    assert_frame_equal(actual, dfn)

    actual = df1 + pd.NaT  # NaT is datetime, not timedelta
    assert_frame_equal(actual, dfn)
    actual = df1 - pd.NaT
    assert_frame_equal(actual, dfn)
def test_timedelta_ops_with_missing_values(self):
    """Timedelta arithmetic with NaT across scalar/Series/DataFrame
    operands; a bare ``np.nan`` operand raises TypeError instead of
    propagating as missing."""
    # setup
    s1 = pd.to_timedelta(Series(['00:00:01']))
    s2 = pd.to_timedelta(Series(['00:00:02']))
    sn = pd.to_timedelta(Series([pd.NaT]))

    df1 = pd.DataFrame(['00:00:01']).apply(pd.to_timedelta)
    df2 = pd.DataFrame(['00:00:02']).apply(pd.to_timedelta)
    dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta)

    scalar1 = pd.to_timedelta('00:00:01')
    scalar2 = pd.to_timedelta('00:00:02')
    timedelta_NaT = pd.to_timedelta('NaT')

    # scalar / scalar
    actual = scalar1 + scalar1
    assert actual == scalar2
    actual = scalar2 - scalar1
    assert actual == scalar1

    # Series / Series
    actual = s1 + s1
    tm.assert_series_equal(actual, s2)
    actual = s2 - s1
    tm.assert_series_equal(actual, s1)

    # Series / scalar, both operand orders
    actual = s1 + scalar1
    tm.assert_series_equal(actual, s2)
    actual = scalar1 + s1
    tm.assert_series_equal(actual, s2)
    actual = s2 - scalar1
    tm.assert_series_equal(actual, s1)
    actual = -scalar1 + s2
    tm.assert_series_equal(actual, s1)

    # timedelta NaT propagates to NaT
    actual = s1 + timedelta_NaT
    tm.assert_series_equal(actual, sn)
    actual = timedelta_NaT + s1
    tm.assert_series_equal(actual, sn)
    actual = s1 - timedelta_NaT
    tm.assert_series_equal(actual, sn)
    actual = -timedelta_NaT + s1
    tm.assert_series_equal(actual, sn)

    # a bare float NaN is not a valid timedelta operand
    with pytest.raises(TypeError):
        s1 + np.nan
    with pytest.raises(TypeError):
        np.nan + s1
    with pytest.raises(TypeError):
        s1 - np.nan
    with pytest.raises(TypeError):
        -np.nan + s1

    actual = s1 + pd.NaT
    tm.assert_series_equal(actual, sn)
    actual = s2 - pd.NaT
    tm.assert_series_equal(actual, sn)

    # Series / DataFrame combinations
    actual = s1 + df1
    tm.assert_frame_equal(actual, df2)
    actual = s2 - df1
    tm.assert_frame_equal(actual, df1)
    actual = df1 + s1
    tm.assert_frame_equal(actual, df2)
    actual = df2 - s1
    tm.assert_frame_equal(actual, df1)

    # DataFrame / DataFrame and DataFrame / scalar
    actual = df1 + df1
    tm.assert_frame_equal(actual, df2)
    actual = df2 - df1
    tm.assert_frame_equal(actual, df1)
    actual = df1 + scalar1
    tm.assert_frame_equal(actual, df2)
    actual = df2 - scalar1
    tm.assert_frame_equal(actual, df1)

    actual = df1 + timedelta_NaT
    tm.assert_frame_equal(actual, dfn)
    actual = df1 - timedelta_NaT
    tm.assert_frame_equal(actual, dfn)

    with pytest.raises(TypeError):
        df1 + np.nan
    with pytest.raises(TypeError):
        df1 - np.nan

    actual = df1 + pd.NaT  # NaT is datetime, not timedelta
    tm.assert_frame_equal(actual, dfn)
    actual = df1 - pd.NaT
    tm.assert_frame_equal(actual, dfn)
def test_grouper_creation_bug(self):
    """pd.Grouper(key=...) groups like the plain column name, alone or
    inside a list mixed with strings (GH 8795, GH 14334, GH 8866)."""
    # GH 8795
    df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
    g = df.groupby('A')
    expected = g.sum()

    g = df.groupby(pd.Grouper(key='A'))
    result = g.sum()
    assert_frame_equal(result, expected)

    result = g.apply(lambda x: x.sum())
    assert_frame_equal(result, expected)

    # explicit axis=0 is equivalent
    g = df.groupby(pd.Grouper(key='A', axis=0))
    result = g.sum()
    assert_frame_equal(result, expected)

    # GH14334
    # pd.Grouper(key=...) may be passed in a list
    df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
                    'B': [1, 1, 2, 2, 3, 3],
                    'C': [1, 2, 3, 4, 5, 6]})
    # Group by single column
    expected = df.groupby('A').sum()
    g = df.groupby([pd.Grouper(key='A')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group by two columns
    # using a combination of strings and Grouper objects
    expected = df.groupby(['A', 'B']).sum()

    # Group with two Grouper objects
    g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group with a string and a Grouper object
    g = df.groupby(['A', pd.Grouper(key='B')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group with a Grouper object and a string
    g = df.groupby([pd.Grouper(key='A'), 'B'])
    result = g.sum()
    assert_frame_equal(result, expected)

    # GH8866: Grouper(level=..., freq=...) on a MultiIndex level
    s = Series(np.arange(8, dtype='int64'),
               index=pd.MultiIndex.from_product(
                   [list('ab'), range(2),
                    date_range('20130101', periods=2)],
                   names=['one', 'two', 'three']))
    result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
    expected = Series([28],
                      index=Index([Timestamp('2013-01-31')],
                                  freq='M', name='three'))
    assert_series_equal(result, expected)

    # just specifying a level breaks
    result = s.groupby(pd.Grouper(level='one')).sum()
    expected = s.groupby(level='one').sum()
    assert_series_equal(result, expected)
def test_compare_timedelta_series(self): # regresssion test for GH5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) actual = s > timedelta(days=1) expected = pd.Series([False, True]) tm.assert_series_equal(actual, expected)
def test_predict(self):
    """ols model predict() agrees with y_predict for the default call,
    explicit x, reordered/shifted x, and explicit beta."""
    y = tm.makeTimeSeries()
    x = tm.makeTimeDataFrame()
    # ols itself is deprecated, so the constructor warns
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        model1 = ols(y=y, x=x)

    assert_series_equal(model1.predict(), model1.y_predict)
    assert_series_equal(model1.predict(x=x), model1.y_predict)
    assert_series_equal(model1.predict(beta=model1.beta),
                        model1.y_predict)

    # manual dot-product with an intercept column reproduces y_predict
    exog = x.copy()
    exog['intercept'] = 1.
    rs = Series(np.dot(exog.values, model1.beta.values), x.index)
    assert_series_equal(model1.y_predict, rs)

    # column order must not matter
    x2 = x.reindex(columns=x.columns[::-1])
    assert_series_equal(model1.predict(x=x2), model1.y_predict)

    # shifted regressors predict via the same beta
    x3 = x2 + 10
    pred3 = model1.predict(x=x3)
    x3['intercept'] = 1.
    x3 = x3.reindex(columns=model1.beta.index)
    expected = Series(np.dot(x3.values, model1.beta.values), x3.index)
    assert_series_equal(expected, pred3)

    # an all-zero beta predicts all zeros
    beta = Series(0., model1.beta.index)
    pred4 = model1.predict(beta=beta)
    assert_series_equal(Series(0., pred4.index), pred4)
def test_comparisons(self, data, reverse, base):
    """Ordered-Categorical comparisons honour the category order and
    reject comparisons against Series/ndarray or mismatched categories."""
    cat_rev = Series(
        Categorical(data, categories=reverse, ordered=True))
    cat_rev_base = Series(
        Categorical(base, categories=reverse, ordered=True))
    cat = Series(Categorical(data, ordered=True))
    cat_base = Series(
        Categorical(base, categories=cat.cat.categories, ordered=True))
    s = Series(base)
    a = np.array(base)

    # comparisons need to take categories ordering into account
    res_rev = cat_rev > cat_rev_base
    exp_rev = Series([True, False, False])
    tm.assert_series_equal(res_rev, exp_rev)

    res_rev = cat_rev < cat_rev_base
    exp_rev = Series([False, False, True])
    tm.assert_series_equal(res_rev, exp_rev)

    res = cat > cat_base
    exp = Series([False, False, True])
    tm.assert_series_equal(res, exp)

    # scalar comparisons: Series result and raw .values agree
    scalar = base[1]
    res = cat > scalar
    exp = Series([False, False, True])
    exp2 = cat.values > scalar
    tm.assert_series_equal(res, exp)
    tm.assert_numpy_array_equal(res.values, exp2)
    res_rev = cat_rev > scalar
    exp_rev = Series([True, False, False])
    exp_rev2 = cat_rev.values > scalar
    tm.assert_series_equal(res_rev, exp_rev)
    tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

    # Only categories with same categories can be compared
    with pytest.raises(TypeError):
        cat > cat_rev

    # categorical cannot be compared to Series or numpy array, and also
    # not the other way around
    msg = ("Cannot compare a Categorical for op __gt__ with type"
           r" <class 'numpy\.ndarray'>")
    with pytest.raises(TypeError, match=msg):
        cat > s
    with pytest.raises(TypeError, match=msg):
        cat_rev > s
    with pytest.raises(TypeError, match=msg):
        cat > a
    with pytest.raises(TypeError, match=msg):
        cat_rev > a

    with pytest.raises(TypeError, match=msg):
        s < cat
    with pytest.raises(TypeError, match=msg):
        s < cat_rev

    with pytest.raises(TypeError, match=msg):
        a < cat
    with pytest.raises(TypeError, match=msg):
        a < cat_rev
def test_set_index_cast_datetimeindex(self):
    # set_index on a datetime column should produce a DatetimeIndex
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])

    # the tz-aware timestamps we expect to round-trip (object dtype)
    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values).copy()
    # strip the tz so the naive values can be compared directly
    comp.tz = None
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)

    # GH 3950
    # reset_index with single level
    for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
        idx = pd.date_range('1/1/2011', periods=5,
                            freq='D', tz=tz, name='idx')
        df = pd.DataFrame(
            {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

        expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                         datetime(2011, 1, 2),
                                         datetime(2011, 1, 3),
                                         datetime(2011, 1, 4),
                                         datetime(2011, 1, 5)],
                                 'a': range(5),
                                 'b': ['A', 'B', 'C', 'D', 'E']},
                                columns=['idx', 'a', 'b'])
        # localize the expected column to the round-tripped timezone
        expected['idx'] = expected['idx'].apply(
            lambda d: pd.Timestamp(d, tz=tz))
        assert_frame_equal(df.reset_index(), expected)
def _compare_fullsample_ols(model1, model2):
    """Assert two full-sample OLS results carry identical coefficients."""
    left = model1.beta
    right = model2.beta
    assert_series_equal(left, right)
def test_sort_values(self):
    # check indexes are reordered corresponding with the values
    ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D'])
    expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C'])
    result = ser.sort_values()
    tm.assert_series_equal(expected, result)

    # introduce NaNs into a time series copy
    ts = self.ts.copy()
    ts[:5] = np.NaN
    vals = ts.values

    # NaNs sort to the end by default
    result = ts.sort_values()
    assert np.isnan(result[-5:]).all()
    tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:]))

    # na_position
    result = ts.sort_values(na_position='first')
    assert np.isnan(result[:5]).all()
    tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:]))

    # something object-type
    ser = Series(['A', 'B'], [1, 2])
    # no failure
    ser.sort_values()

    # ascending=False
    ordered = ts.sort_values(ascending=False)
    expected = np.sort(ts.valid().values)[::-1]
    assert_almost_equal(expected, ordered.valid().values)
    ordered = ts.sort_values(ascending=False, na_position='first')
    assert_almost_equal(expected, ordered.valid().values)

    # ascending=[False] should behave the same as ascending=False
    ordered = ts.sort_values(ascending=[False])
    expected = ts.sort_values(ascending=False)
    assert_series_equal(expected, ordered)
    ordered = ts.sort_values(ascending=[False], na_position='first')
    expected = ts.sort_values(ascending=False, na_position='first')
    assert_series_equal(expected, ordered)

    # invalid "ascending" arguments must raise
    pytest.raises(ValueError, lambda: ts.sort_values(ascending=None))
    pytest.raises(ValueError, lambda: ts.sort_values(ascending=[]))
    pytest.raises(ValueError,
                  lambda: ts.sort_values(ascending=[1, 2, 3]))
    pytest.raises(ValueError,
                  lambda: ts.sort_values(ascending=[False, False]))
    pytest.raises(ValueError, lambda: ts.sort_values(ascending='foobar'))

    # inplace=True
    ts = self.ts.copy()
    ts.sort_values(ascending=False, inplace=True)
    tm.assert_series_equal(ts, self.ts.sort_values(ascending=False))
    tm.assert_index_equal(ts.index,
                          self.ts.sort_values(ascending=False).index)

    # GH 5856/5853
    # Series.sort_values operating on a view
    df = DataFrame(np.random.randn(10, 4))
    s = df.iloc[:, 0]

    # in-place sort of a view must raise, not silently corrupt the parent
    def f():
        s.sort_values(inplace=True)

    pytest.raises(ValueError, f)
def test_reset_index(self):
    """reset_index behavior: MultiIndex levels become columns, default
    column naming ('index' / 'level_N'), column-name preservation,
    partial resets, and inplace operation.

    Fix: the original repeated the full reset / compare against
    ``self.frame.reset_index()`` twice verbatim; the duplicate
    statements are removed.
    """
    stacked = self.frame.stack()[::2]
    stacked = DataFrame({'foo': stacked, 'bar': stacked})

    names = ['first', 'second']
    stacked.index.names = names
    deleveled = stacked.reset_index()
    # each MultiIndex level must reappear as a column with its level name
    for i, (lev, lab) in enumerate(zip(stacked.index.levels,
                                       stacked.index.labels)):
        values = lev.take(lab)
        name = names[i]
        tm.assert_index_equal(values, Index(deleveled[name]))

    # unnamed levels fall back to 'level_0', 'level_1', ...
    stacked.index.names = [None, None]
    deleveled2 = stacked.reset_index()
    tm.assert_series_equal(deleveled['first'], deleveled2['level_0'],
                           check_names=False)
    tm.assert_series_equal(deleveled['second'], deleveled2['level_1'],
                           check_names=False)

    # default name assigned
    rdf = self.frame.reset_index()
    exp = pd.Series(self.frame.index.values, name='index')
    tm.assert_series_equal(rdf['index'], exp)

    # default name assigned, corner case: 'index' column already taken
    df = self.frame.copy()
    df['index'] = 'foo'
    rdf = df.reset_index()
    exp = pd.Series(self.frame.index.values, name='level_0')
    tm.assert_series_equal(rdf['level_0'], exp)

    # but this is ok: a named index resets under its own name
    self.frame.index.name = 'index'
    deleveled = self.frame.reset_index()
    tm.assert_series_equal(deleveled['index'],
                           pd.Series(self.frame.index))
    tm.assert_index_equal(deleveled.index,
                          pd.Index(np.arange(len(deleveled))))

    # preserve column names
    self.frame.columns.name = 'columns'
    resetted = self.frame.reset_index()
    assert resetted.columns.name == 'columns'

    # only remove certain columns
    frame = self.frame.reset_index().set_index(['index', 'A', 'B'])
    rs = frame.reset_index(['A', 'B'])
    # TODO should reset_index check_names ?
    assert_frame_equal(rs, self.frame, check_names=False)

    rs = frame.reset_index(['index', 'A', 'B'])
    assert_frame_equal(rs, self.frame.reset_index(), check_names=False)

    rs = frame.reset_index('A')
    xp = self.frame.reset_index().set_index(['index', 'B'])
    assert_frame_equal(rs, xp, check_names=False)

    # test resetting in place
    df = self.frame.copy()
    resetted = self.frame.reset_index()
    df.reset_index(inplace=True)
    assert_frame_equal(df, resetted, check_names=False)

    # drop=True discards the reset level instead of inserting a column
    frame = self.frame.reset_index().set_index(['index', 'A', 'B'])
    rs = frame.reset_index('A', drop=True)
    xp = self.frame.copy()
    del xp['A']
    xp = xp.set_index(['B'], append=True)
    assert_frame_equal(rs, xp, check_names=False)
def test_where_scalar(t, df, cond, expected_func):
    """ibis.where with a scalar fallback matches the pandas expectation."""
    expected = expected_func(df)
    where_expr = ibis.where(cond, t['plain_int64'], 3.0)
    tm.assert_series_equal(where_expr.execute(), expected)
def test_fillna():
    """Forward-fill on a MoneyArray-backed Series propagates the last
    valid amount into missing slots."""
    filled = pd.Series(mpd.MoneyArray([1, 0], 'USD')).fillna(method='ffill')
    target = pd.Series(mpd.MoneyArray([1, 1], 'USD'))
    tm.assert_series_equal(filled, target)
def test_notin(t, df, elements):
    """notin() is the complement of pandas isin() on the same column."""
    result = t.plain_float64.notin(elements).execute()
    expected = ~df.plain_float64.isin(elements)
    tm.assert_series_equal(result, expected)
def test_shift_on_column(n, column, sql):
    """Column shift computed via the SQL backend matches pandas shift."""
    t = symbol('t', discover(sql))
    shifted = compute(t[column].shift(n), sql)
    result = odo(shifted, pd.Series)
    expected = odo(sql, pd.DataFrame)[column].shift(n)
    tm.assert_series_equal(result, expected)
def test_notnull(t, df):
    """notnull() on a string column with nulls mirrors pandas notnull()."""
    computed = t.strings_with_nulls.notnull().execute()
    tm.assert_series_equal(computed, df.strings_with_nulls.notnull())
def test_left_binary_op(t, df, op, args):
    """A binary op applied to the ibis column equals the same op on the
    backing pandas column."""
    computed = op(*args(t.float64_with_zeros)).execute()
    reference = op(*args(df.float64_with_zeros))
    tm.assert_series_equal(computed, reference)
def test_series_limit(t, df, offset):
    """limit(n, offset) corresponds to iloc slicing of the pandas column."""
    n = 5
    result = t.plain_int64.limit(n, offset=offset).execute()
    expected = df.plain_int64.iloc[offset:offset + n]
    tm.assert_series_equal(result, expected)
def test_scalar_parameter(t, df, raw_value):
    """A bound scalar parameter compares like the raw value in pandas."""
    param = ibis.param(dt.double)
    comparison = t.float64_with_zeros == param
    result = comparison.execute(params={param: raw_value})
    tm.assert_series_equal(result, df.float64_with_zeros == raw_value)
def test_table_column(t, df):
    """Selecting a single column executes to the backing pandas column."""
    tm.assert_series_equal(t.plain_int64.execute(), df.plain_int64)
def test_distinct(t, df):
    """distinct() yields the unique values, keeping the column name."""
    computed = t.dup_strings.distinct().execute()
    reference = pd.Series(df.dup_strings.unique(), name='dup_strings')
    tm.assert_series_equal(computed, reference)
def test_mixed_timezone_series_ops_object(self): # GH#13043 ser = pd.Series( [ pd.Timestamp("2015-01-01", tz="US/Eastern"), pd.Timestamp("2015-01-01", tz="Asia/Tokyo"), ], name="xxx", ) assert ser.dtype == object exp = pd.Series( [ pd.Timestamp("2015-01-02", tz="US/Eastern"), pd.Timestamp("2015-01-02", tz="Asia/Tokyo"), ], name="xxx", ) tm.assert_series_equal(ser + pd.Timedelta("1 days"), exp) tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp) # object series & object series ser2 = pd.Series( [ pd.Timestamp("2015-01-03", tz="US/Eastern"), pd.Timestamp("2015-01-05", tz="Asia/Tokyo"), ], name="xxx", ) assert ser2.dtype == object exp = pd.Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) ser = pd.Series( [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")], name="xxx", dtype=object, ) assert ser.dtype == object exp = pd.Series([pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx") tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp)
def test_null_if_zero(t, df, column):
    """nullifzero() maps zeros to NaN, leaving other values untouched."""
    computed = t[column].nullifzero().execute()
    reference = df[column].replace(0, np.nan)
    tm.assert_series_equal(computed, reference)