import pandas as pd

# Open the Beijing PM2.5 file
filepath = './PM2.5/BeijingPM20100101_20151231.csv'
df = pd.read_csv(filepath)
print(df.head())
print('*' * 100)
print(df.index)
print('*' * 100)
# print(df.info())
print('*' * 100)

# PeriodIndex formats the time series, here at hourly frequency;
# it converts the separate date/time columns into pandas' period type
period = pd.PeriodIndex(year=df['year'], month=df['month'], day=df['day'],
                        hour=df['hour'], freq='H')
print(period)
print(type(period))
print('*' * 100)

# First: add the period series to the DataFrame as a regular column
df['datetime'] = period
print(df.head())
print('*' * 100)

# Then: promote the datetime column to the (row) index
df.set_index('datetime', inplace=True)
print(df.head())
print('*' * 100)
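# Follow-up sketch (not part of the original snippet): once the hourly
# PeriodIndex is the row index, time-based downsampling becomes a one-liner.
# Assumes the same CSV layout and the 'PM_US Post' column used in the later
# PM2.5 snippet.
import pandas as pd

df = pd.read_csv('./PM2.5/BeijingPM20100101_20151231.csv')
df.index = pd.PeriodIndex(year=df['year'], month=df['month'],
                          day=df['day'], hour=df['hour'], freq='H')
monthly_pm = df['PM_US Post'].resample('M').mean()  # hourly periods -> monthly means
print(monthly_pm.head())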
def test_is_period(self):
    assert lib.is_period(pd.Period('2011-01', freq='M'))
    assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))
    assert not lib.is_period(pd.Timestamp('2011-01'))
    assert not lib.is_period(1)
    assert not lib.is_period(np.nan)
def test_isfinite_pandas_period_index_nat(self):
    daily = pd.date_range('2017-1-1', '2017-1-3', freq='D').to_period('D')
    daily = pd.PeriodIndex(list(daily) + [pd.NaT])
    self.assertEqual(isfinite(daily), np.array([True, True, True, False]))
# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt

file_path = "./BeijingPM20100101_20151231.csv"
df = pd.read_csv(file_path)

# Convert the separate date/time columns into pandas' period type via PeriodIndex
period = pd.PeriodIndex(year=df["year"], month=df["month"], day=df["day"],
                        hour=df["hour"], freq="H")
df["datetime"] = period
# print(df.head(10))

# Set datetime as the index
df.set_index("datetime", inplace=True)

# Downsample to 7-day means
df = df.resample("7D").mean()
print(df.head())

# Handle missing data: drop missing values
# print(df["PM_US Post"])
data = df["PM_US Post"]
data_china = df["PM_Nongzhanguan"]
print(data_china.head(100))

# Plot
_x = data.index
_x = [i.strftime("%Y%m%d") for i in _x]
_x_china = [i.strftime("%Y%m%d") for i in data_china.index]
print(len(_x), len(_x_china))
_y = data.values
import numpy as np
import pandas as pd

path = '~/data/'

# Load all the data
Client = pd.read_excel(path + 'Client_info_20191118.xlsx', sheet_name='Sheet1')
TPA = pd.read_excel(path + 'Copy_of_TPA_cash.xlsx', sheet_name='Sheet1')
Customer = pd.read_csv(path + 'Customer_Data_Request_20191121.csv')
Perf = pd.read_csv(path + 'Performance_Data_Request_20191204.csv')

# Client data columns selected to merge
Client_cols = ['Active', 'Client ID', 'Account Name', 'Industry', 'NAICS Code',
               'NAICS Description', 'Broker', 'TPA_x', 'Launch Date',
               'Termination Date', 'Affiliate/Fed Gov?', 'Cash']
Client = Client.merge(TPA, how='left', left_on='Client ID', right_on='HOST ID')
Customer = Customer.merge(Client[Client_cols], how='left',
                          left_on='Unique_Company_ID', right_on='Client ID')
Perf = Perf.merge(Customer, how='left', on='Unique_Customer_ID')

# Data preprocessing of Perf
Perf['Year_and_Month'] = pd.to_datetime(Perf['Year_and_Month'].astype(str), format='%Y%m')
Perf['YQ'] = pd.PeriodIndex(Perf['Year_and_Month'], freq='Q')
def test_get_loc(self):
    # GH 17717
    p0 = pd.Period("2017-09-01")
    p1 = pd.Period("2017-09-02")
    p2 = pd.Period("2017-09-03")

    # get the location of p1/p2 from
    # monotonic increasing PeriodIndex with non-duplicate
    idx0 = pd.PeriodIndex([p0, p1, p2])
    expected_idx1_p1 = 1
    expected_idx1_p2 = 2

    assert idx0.get_loc(p1) == expected_idx1_p1
    assert idx0.get_loc(str(p1)) == expected_idx1_p1
    assert idx0.get_loc(p2) == expected_idx1_p2
    assert idx0.get_loc(str(p2)) == expected_idx1_p2

    msg = "Cannot interpret 'foo' as period"
    with pytest.raises(KeyError, match=msg):
        idx0.get_loc("foo")
    with pytest.raises(KeyError, match=r"^1\.1$"):
        idx0.get_loc(1.1)

    msg = (r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\],"
           r" dtype='period\[D\]', freq='D'\)' is an invalid key")
    with pytest.raises(TypeError, match=msg):
        idx0.get_loc(idx0)

    # get the location of p1/p2 from
    # monotonic increasing PeriodIndex with duplicate
    idx1 = pd.PeriodIndex([p1, p1, p2])
    expected_idx1_p1 = slice(0, 2)
    expected_idx1_p2 = 2

    assert idx1.get_loc(p1) == expected_idx1_p1
    assert idx1.get_loc(str(p1)) == expected_idx1_p1
    assert idx1.get_loc(p2) == expected_idx1_p2
    assert idx1.get_loc(str(p2)) == expected_idx1_p2

    msg = "Cannot interpret 'foo' as period"
    with pytest.raises(KeyError, match=msg):
        idx1.get_loc("foo")
    with pytest.raises(KeyError, match=r"^1\.1$"):
        idx1.get_loc(1.1)

    msg = (r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\],"
           r" dtype='period\[D\]', freq='D'\)' is an invalid key")
    with pytest.raises(TypeError, match=msg):
        idx1.get_loc(idx1)

    # get the location of p1/p2 from
    # non-monotonic increasing/decreasing PeriodIndex with duplicate
    idx2 = pd.PeriodIndex([p2, p1, p2])
    expected_idx2_p1 = 1
    expected_idx2_p2 = np.array([True, False, True])

    assert idx2.get_loc(p1) == expected_idx2_p1
    assert idx2.get_loc(str(p1)) == expected_idx2_p1
    tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2)
    tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2)
def __init__(self, data=None, origin=None, development=None,
             columns=None, index=None, origin_format=None,
             development_format=None, cumulative=None, *args, **kwargs):
    if data is None:
        'Instance with nothing set'
        return
    # Sanitize inputs
    index, columns, origin, development = self._str_to_list(
        index, columns, origin, development)
    key_gr = origin + self._flatten(development, index)
    # Aggregate data
    data_agg = data.groupby(key_gr).sum().reset_index()
    if not index:
        index = ['Total']
        data_agg[index[0]] = 'Total'
    # Initialize origin and development dates and grains
    origin_date = TriangleBase._to_datetime(
        data_agg, origin, format=origin_format)
    self.origin_grain = TriangleBase._get_grain(origin_date)
    m_cnt = {'Y': 12, 'Q': 3, 'M': 1}
    if development:
        development_date = TriangleBase._to_datetime(
            data_agg, development, period_end=True, format=development_format)
        self.development_grain = TriangleBase._get_grain(development_date)
        col = 'development'
    else:
        development_date = origin_date + \
            pd.tseries.offsets.MonthEnd(m_cnt[self.origin_grain])
        self.development_grain = self.origin_grain
        col = None
    # Prep the data for 4D Triangle
    origin_date = pd.PeriodIndex(origin_date,
                                 freq=self.origin_grain).to_timestamp()
    data_agg = self._get_axes(data_agg, index, columns,
                              origin_date, development_date)
    data_agg = pd.pivot_table(data_agg, index=index + ['origin'],
                              columns=col, values=columns, aggfunc='sum')
    # Assign object properties
    self.kdims = np.array(np.array(data_agg.index.droplevel(-1).unique()).tolist())
    self.odims = np.array(data_agg.index.levels[-1].unique())
    if development:
        self.ddims = np.array(data_agg.columns.levels[-1].unique())
        self.ddims = self.ddims * (m_cnt[self.development_grain])
        self.vdims = np.array(data_agg.columns.levels[0].unique())
    else:
        self.ddims = np.array([None])
        self.vdims = np.array(data_agg.columns.unique())
    self.valuation_date = development_date.max()
    self.key_labels = index
    self._set_slicers()
    # Create 4D Triangle
    triangle = np.reshape(np.array(data_agg),
                          (len(self.kdims), len(self.odims),
                           len(self.vdims), len(self.ddims)))
    triangle = np.swapaxes(triangle, 1, 2)
    # Set all 0s to NAN for nansafe ufunc arithmetic
    triangle[triangle == 0] = np.nan
    self.values = np.array(triangle, dtype=kwargs.get('dtype', None))
    # Used to show NANs in lower part of triangle
    self.nan_override = False
    self.valuation = self._valuation_triangle()
    self.is_cumulative = cumulative
def compute_short_term_reversal():
    resids = read_risk_data("residuals")
    srisk = read_risk_data("srisk")
    sig = (-resids / srisk).ewm(span=5, min_periods=2).mean()
    sig.index = pd.PeriodIndex(sig.index, freq="B")
    return rank_signal(sig)
def read_risk_alphas():
    alphas = pd.read_csv(f"{ALPHA_DIR}/risk_alphas")
    alphas = alphas.set_index("date")
    alphas.index = pd.PeriodIndex(alphas.index, freq="B", name="date")
    return alphas[alphas.abs().max(axis=1) > 0]
def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(["a"], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(["1970-01-01"], freq="d", tz="America/New_York", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(["xyx"], ["xyx", "zzz"], ordered=True, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES], ordered=True, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name="a"), pd.Float64Index([1.0], name="b")]
    codes = [[0], [0]]
    idx = pd.MultiIndex(levels=levels, names=["a", "b"], codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]
    codes = [[0], [0], [0]]
    idx = pd.MultiIndex(levels=levels, names=["a", "b", "timedelta"], codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
def _datetime_index_to_period(self, index: pd.DatetimeIndex) -> pd.PeriodIndex:
    if index.freq is None:
        return pd.PeriodIndex(index, freq=self.freq)
    else:
        return pd.PeriodIndex(index)
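# A minimal standalone sketch of the fallback logic above. The wrapper class
# and its `self.freq` attribute are not shown in the snippet, so here the
# fallback frequency is passed as a plain argument (an assumption for
# illustration only).
import pandas as pd

def datetime_index_to_period(index: pd.DatetimeIndex, fallback_freq: str) -> pd.PeriodIndex:
    # A DatetimeIndex built by date_range carries a freq, so PeriodIndex can
    # infer the period length; an index built from raw timestamps does not,
    # and the explicit fallback frequency is used instead.
    if index.freq is None:
        return pd.PeriodIndex(index, freq=fallback_freq)
    return pd.PeriodIndex(index)

print(datetime_index_to_period(pd.date_range('2020-01-01', periods=3, freq='D'), 'D'))
print(datetime_index_to_period(pd.DatetimeIndex(['2020-01-01', '2020-01-05']), 'D'))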
# Example 5.2: Pair Trading AUD.CAD with Rollover Interests
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
#import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df = pd.read_csv('inputData_AUDCAD_20120426.csv')
#df['Date']=pd.to_datetime(df['Date'], format='%Y%m%d').dt.date  # remove HH:MM:SS
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')
df.set_index('Date', inplace=True)

aud = pd.read_csv('AUD_interestRate.csv')
audindex = pd.PeriodIndex(year=aud.Year, month=aud.Month, freq='M')
#aud.index=audindex.to_timestamp().date
aud.index = audindex.to_timestamp()

cad = pd.read_csv('CAD_interestRate.csv')
cadindex = pd.PeriodIndex(year=cad.Year, month=cad.Month, freq='M')
#cad.index=cadindex.to_timestamp().date
cad.index = cadindex.to_timestamp()

df = pd.merge(df, aud, how='outer', left_index=True, right_index=True)
df.drop({'Year', 'Month'}, axis=1, inplace=True)
df.rename({'Rates': 'AUD_Rates'}, axis=1, inplace=True)

df = pd.merge(df, cad, how='outer', left_index=True, right_index=True)
df.drop({'Year', 'Month'}, axis=1, inplace=True)
df.rename({'Rates': 'CAD_Rates'}, axis=1, inplace=True)
import numpy as np
import pandas as pd
from pandas import Series

# Periods and period arithmetic
# A period represents a span of time, such as a number of days, months, quarters, or years.
# The Period class represents this data type; its constructor takes a string or an integer.
p = pd.Period(2007, freq='A-DEC')
# This Period object covers the full span from January 1, 2007 to December 31, 2007.
# Adding or subtracting an integer shifts the period by its own frequency.
# print(p + 5)
# If two Period objects have the same frequency, their difference is the number of units between them.
# print(pd.Period('2014', freq='A-DEC') - p)
# The period_range function creates regular ranges of periods.
rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
# print(rng)
# The PeriodIndex class holds a sequence of Periods and can be used as an axis index
# in any pandas data structure.
# print(Series(np.random.randn(6), index=rng))
# The PeriodIndex constructor also accepts a list of strings directly.
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
# print(index)

# Period frequency conversion
# Period and PeriodIndex objects can be converted to another frequency with their asfreq method.
p = pd.Period('2007', freq='A-DEC')
# print(p.asfreq('M', how='start'))
# print(p.asfreq('M', how='end'))
p = pd.Period('2007', freq='A-JUN')
# print(p.asfreq('M', 'start'))
# print(p.asfreq('M', 'end'))
# When converting from a higher frequency to a lower one, the super-period is determined
# by where the sub-period belongs.
p = pd.Period('2007-08', 'M')
# print(p.asfreq('A-JUN'))
# Frequency conversion works the same way for a PeriodIndex or a time series.
rng = pd.period_range('2006', '2009', freq='A-DEC')
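# A short runnable check of the super-period rule described above (a sketch;
# only pandas is assumed). August 2007 belongs to fiscal year 2008 under an
# A-JUN year end, but to calendar year 2007 under an A-DEC year end.
import pandas as pd

p = pd.Period('2007-08', 'M')
print(p.asfreq('A-JUN'))  # Period('2008', 'A-JUN')
print(p.asfreq('A-DEC'))  # Period('2007', 'A-DEC')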
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# CrimeDataDayLevel
data = pd.read_csv('CrimeDataDayLevel.csv',
                   parse_dates=['BeginDate', 'ReportedDate', 'EnteredDate',
                                'lastchanged', 'LastUpdateDate'])
data['CrimeDate'] = data['BeginDate'].dt.date
data['Year'] = data['BeginDate'].dt.year
data['Year'] = data['Year'].astype('str')
data['Year-Month'] = pd.PeriodIndex(data['BeginDate'], freq='M')  # to represent year-month of the crime
data['Month'] = data['BeginDate'].dt.month
data['Weekday'] = data['BeginDate'].dt.weekday  # 0 is Monday
data['Hour'] = data['BeginDate'].dt.hour

# categorizing offence types
segment_data = pd.read_csv('Segment_data.csv')
data = pd.merge(data, segment_data, on="Offense")

# loading weekday names
week_data = pd.read_csv('Week_data.csv')
data = pd.merge(data, week_data, on="Weekday")

# categorizing parts of a day
time_data = pd.read_csv('Time_data.csv')
def test_constructor_floats(self, floats):
    with pytest.raises(TypeError):
        pd.PeriodIndex._simple_new(floats, freq='M')

    with pytest.raises(TypeError):
        pd.PeriodIndex(floats, freq='M')
def origin(self, value):
    self._len_check(self.origin, value)
    value = pd.PeriodIndex([item for item in list(value)],
                           freq=self.origin_grain).to_timestamp()
    self.odims = value.values
def test_is_period_deprecated():
    with tm.assert_produces_warning(FutureWarning):
        assert not com.is_period([1, 2, 3])
        assert not com.is_period(pd.Index([1, 2, 3]))
        assert com.is_period(pd.PeriodIndex(["2017-01-01"], freq="D"))
def grain(self, grain="", trailing=False, inplace=False):
    """Changes the grain of a cumulative triangle.

    Parameters
    ----------
    grain : str
        The grain to which you want your triangle converted, specified as
        'OXDY' where X and Y can take on values of ``['Y', 'Q', 'M']``.
        For example, 'OYDY' for Origin Year/Development Year, 'OQDM' for
        Origin quarter/Development Month, etc.
    trailing : bool
        For partial years/quarters, trailing will set the year/quarter end to
        that of the latest available from the data.
    inplace : bool
        Whether to mutate the existing Triangle instance or return a new one.

    Returns
    -------
    Triangle
    """
    ograin_old, ograin_new = self.origin_grain, grain[1:2]
    dgrain_old, dgrain_new = self.development_grain, grain[-1]
    valid = {"Y": ["Y"], "Q": ["Q", "Y"], "M": ["Y", "Q", "M"]}
    if ograin_new not in valid.get(ograin_old, []) or \
            dgrain_new not in valid.get(dgrain_old, []):
        raise ValueError("New grain not compatible with existing grain")
    if (self.is_cumulative is None and dgrain_old != dgrain_new
            and self.shape[-1] > 1):
        raise AttributeError(
            "The is_cumulative attribute must be set before using grain method."
        )
    if valid["M"].index(ograin_new) > valid["M"].index(dgrain_new):
        raise ValueError("Origin grain must be coarser than development grain")
    obj = self.dev_to_val()
    if ograin_new != ograin_old:
        if trailing:
            mn = self.origin[-1].strftime("%b").upper() if trailing else "DEC"
            freq = "Q-" if ograin_new == "Q" else "A-"
            o = pd.PeriodIndex(self.origin, freq=freq + mn)
            o = np.array(o.to_timestamp(how="s"))
        else:
            freq = "%YQ%q" if ograin_new == "Q" else "%Y"
            o = pd.to_datetime(self.origin.strftime(freq)).values
        values = [
            getattr(obj.loc[..., i, :], "sum")(2, auto_sparse=False, keepdims=True)
            for i in self.origin.groupby(o).values()
        ]
        obj = concat(values, axis=2, ignore_index=True)
        obj.odims = np.unique(o)
        obj.origin_grain = ograin_new
        if len(obj.ddims) > 1 and pd.Timestamp(obj.odims[0]).strftime(
                "%Y%m") != obj.valuation[0].strftime("%Y%m"):
            addl_ts = (pd.period_range(
                obj.odims[0], obj.valuation[0],
                freq="M")[:-1].to_timestamp().values)
            addl = obj.iloc[..., -len(addl_ts):] * 0
            addl.ddims = addl_ts
            obj = concat((addl, obj), axis=-1)
    if dgrain_old != dgrain_new and obj.shape[-1] > 1:
        step = self._dstep()[dgrain_old][dgrain_new]
        d = np.sort(
            len(obj.development) - np.arange(0, len(obj.development), step) - 1)
        if obj.is_cumulative:
            obj = obj.iloc[..., d]
        else:
            ddims = obj.ddims[d]
            d2 = [d[0]] * (d[0] + 1) + list(np.repeat(np.array(d[1:]), step))
            values = [
                getattr(obj.iloc[..., i], "sum")(3, auto_sparse=False, keepdims=True)
                for i in obj.development.groupby(d2).groups.values()
            ]
            obj = concat(values, axis=3, ignore_index=True)
            obj.ddims = ddims
        obj.development_grain = dgrain_new
    obj = obj.dev_to_val() if self.is_val_tri else obj.val_to_dev()
    if inplace:
        self = obj
        return self
    return obj
    assert s.dtype == 'Period[M]'
    for res, exp in zip(s, vals):
        assert isinstance(res, pd.Period)
        assert res.freq == 'M'
        assert res == exp


@pytest.mark.parametrize(
    'array, expected_type, dtype',
    [
        (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'),
        (np.array(['a', 'b']), np.ndarray, 'object'),
        (pd.Categorical(['a', 'b']), pd.Categorical, 'category'),
        (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), DatetimeArray,
         'datetime64[ns, US/Central]'),
        (pd.PeriodIndex([2018, 2019], freq='A'), pd.core.arrays.PeriodArray,
         pd.core.dtypes.dtypes.PeriodDtype("A-DEC")),
        (pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray,
         'interval'),
        # This test is currently failing for datetime64[ns] and timedelta64[ns].
        # The NumPy type system is sufficient for representing these types, so
        # we just use NumPy for Series / DataFrame columns of these types (so
        # we get consolidation and so on).
        # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray
        # abstraction to for code reuse.
        # At the moment, we've judged that allowing this test to fail is more
        # practical that overriding Series._values to special case
        # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray.
        pytest.param(
            pd.DatetimeIndex(['2017', '2018']),
def model(params, series, exogen, yearly_seasonality, alpha=None, beta=None,
          gamma=None, omega=None, epsilon=None, smoothing=None):
    """
    This function runs an ETS(M,Ad,M) model with exogen variables.

    This is an Error, Trend, Seasonality exponential smoothing model. The first
    M stands for multiplicative or relative errors, the Ad for an additive
    damped trend and the last M for multiplicative seasonality. The model also
    contains additional exogen variables which are dummies for certain events.

    The actual computation of the fitted model is done in the function
    ETS_M_Ad_M, which further contains the functions calc_new_estimates,
    calc_error, save_estimates and seasonal_matrices. These are all explained
    in the following code.

    Parameters:
        params: model parameters
        series: the time series in a pandas Series format
        exogen: the exogen variables in a pandas DataFrame format with each
            column being a variable and the time as its index

    Return:
        The sum of squared errors of the fitted model. This allows the model
        to be passed to an optimizer which minimizes the sum of squared
        residuals dependent on the input parameters (params).
    """
    # defining all model parameters from the params vector
    # Note that the seasonal and exogen variable parameters are vectors while
    # the other parameters are scalars
    if smoothing:
        alpha = alpha
        beta = beta
        gamma = gamma
        omega = omega
        epsilon = epsilon
        level_initial = params[0]
        slope_initial = params[1]
        # prior we had an np.vstack around this and this broke it down I think
        seasonal_initial = params[2:9]
        # added len(exogen) as now we have a variable number of exogen
        # variables due to days before and after
        reg = (params[9:9 + len(exogen.columns)])
    else:
        alpha = params[0]
        beta = params[1]
        gamma = params[2]
        omega = params[3]
        level_initial = params[4]
        slope_initial = params[5]
        seasonal_initial = params[6:13]
        # added len(exogen) as now we have a variable number of exogen
        # variables due to days before and after
        reg = (params[13:13 + len(exogen.columns)])

    # defining the initial yearly seasonal components as a Fourier series
    if yearly_seasonality == "fourier":
        # defining the index as a date variable which will become relevant for
        # subsequent computation
        yearly = pd.DataFrame({'date': series.index})
        yearly = yearly.set_index(pd.PeriodIndex(series.index, freq='D'))
        # yearly seasonality with N=10 Fourier series elements
        # N=1
        yearly['yearly_sin365'] = np.sin(2 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365'] = np.cos(2 * np.pi * yearly.index.dayofyear / 365.25)
        # N=2
        yearly['yearly_sin365_2'] = np.sin(4 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_2'] = np.cos(4 * np.pi * yearly.index.dayofyear / 365.25)
        # N=3
        yearly['yearly_sin365_3'] = np.sin(6 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_3'] = np.cos(6 * np.pi * yearly.index.dayofyear / 365.25)
        # N=4
        yearly['yearly_sin365_4'] = np.sin(8 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_4'] = np.cos(8 * np.pi * yearly.index.dayofyear / 365.25)
        # N=5
        yearly['yearly_sin365_5'] = np.sin(10 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_5'] = np.cos(10 * np.pi * yearly.index.dayofyear / 365.25)
        # N=6
        yearly['yearly_sin365_6'] = np.sin(12 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_6'] = np.cos(12 * np.pi * yearly.index.dayofyear / 365.25)
        # N=7
        yearly['yearly_sin365_7'] = np.sin(14 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_7'] = np.cos(14 * np.pi * yearly.index.dayofyear / 365.25)
        # N=8
        yearly['yearly_sin365_8'] = np.sin(16 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_8'] = np.cos(16 * np.pi * yearly.index.dayofyear / 365.25)
        # N=9
        yearly['yearly_sin365_9'] = np.sin(18 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_9'] = np.cos(18 * np.pi * yearly.index.dayofyear / 365.25)
        # N=10
        yearly['yearly_sin365_10'] = np.sin(20 * np.pi * yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_10'] = np.cos(20 * np.pi * yearly.index.dayofyear / 365.25)
        # deleting the date column as it is no longer required and should not
        # be in the linear regression
        del yearly['date']

        if smoothing:
            # 1. compute the Fourier series results from the weights times cos(t) and sin(t) for t=0...365
            yearly_init = params[9 + len(exogen.columns):29 + len(exogen.columns)] * yearly.iloc[0:365]
            # 2. sum up the total yearly seasonality of each day by summing up all weighted trigonometric functional values
            yearly_init = 1 + yearly_init.sum(axis=1)
            # 3. define this array of 365 dummies as an array
            yearly_init = yearly_init  # np.vstack(yearly_init)
            # 4. turn the array around as we want the most recent seasonality effect to be at the end
            yearly_init = yearly_init[::-1]
            # yearly smoothing parameter
            epsilon = epsilon
        else:
            # 1. compute the Fourier series results from the weights times cos(t) and sin(t) for t=0...365
            yearly_init = params[13 + len(exogen.columns):33 + len(exogen.columns)] * yearly.iloc[0:365]
            # 2. sum up the total yearly seasonality of each day by summing up all weighted trigonometric functional values
            yearly_init = 1 + yearly_init.sum(axis=1)
            # 3. define this array of 365 dummies as an array
            yearly_init = yearly_init  # np.vstack(yearly_init)
            # 4. turn the array around as we want the most recent seasonality effect to be at the end
            yearly_init = yearly_init[::-1]
            # yearly smoothing parameter
            epsilon = params[33 + len(exogen.columns)]

    # Built-in exception handling below prints the parameters and the error sum
    # if an error in the model occurs. Note that the exception gives back yearly
    # seasonality parameters if they are specified in the model.
    # For the dummy model we have 12 dummies and a smoothing parameter.
    elif yearly_seasonality == "dummies":
        if smoothing:
            yearly_init = (params[9 + len(exogen.columns):21 + len(exogen.columns)])
            epsilon = epsilon
        else:
            yearly_init = (params[13 + len(exogen.columns):25 + len(exogen.columns)])
            epsilon = params[25 + len(exogen.columns)]

    try:
        if yearly_seasonality == "fourier" or yearly_seasonality == 'dummies':
            results = ETS_M_Ad_M(alpha, beta, gamma, omega, level_initial,
                                 slope_initial, seasonal_initial, reg, series,
                                 exogen, yearly_seasonality, yearly_init, epsilon)
        else:
            results = ETS_M_Ad_M(alpha, beta, gamma, omega, level_initial,
                                 slope_initial, seasonal_initial, reg, series,
                                 exogen, yearly_seasonality, yearly_init=None,
                                 epsilon=None)
    except:
        if yearly_seasonality == "fourier":
            print('alpha:', alpha, 'beta:', beta, 'gamma:', gamma, 'omega:', omega,
                  level_initial, slope_initial, seasonal_initial, 'reg:', reg,
                  'Fourier weights:',
                  params[13 + len(exogen.columns):33 + len(exogen.columns)],
                  'epsilon:', params[33 + len(exogen.columns)])
            if error_sum:
                print('error_sum:', error_sum)
        if yearly_seasonality == "dummies":
            print('alpha:', alpha, 'beta:', beta, 'gamma:', gamma, 'omega:', omega,
                  level_initial, slope_initial, seasonal_initial, 'reg:', reg,
                  'monthly dummies:',
                  params[13 + len(exogen.columns):25 + len(exogen.columns)],
                  'epsilon:', params[25 + len(exogen.columns)])
            if error_sum:
                print('error_sum:', error_sum)
        else:
            print('alpha:', alpha, 'beta:', beta, 'gamma:', gamma, 'omega:', omega,
                  level_initial, slope_initial, seasonal_initial, 'reg:', reg)
            if error_sum:
                print('error_sum:', error_sum)

    error_list = results['errors_list']
    error_list = [number ** 2 for number in error_list]
    error_sum = sum(error_list)
    return error_sum
def scrape(self, metrics: Set[str], country_filter: Optional[List[str]] = None,
           region_filter: Optional[List[str]] = None):
    columns = {
        'idx': deque(),
        'date': deque(),
        'country': deque(),
        'country_region': deque(),
        'region': deque(),
        # 'series': deque(),
        # 'weekday': deque(),
        # 'value': deque()
    }
    for metric_name in metrics:
        assert metric_name in self.metrics
        columns[metric_name] = deque()

    for idx, info in enumerate(self.download()):
        file_name, date = info
        with open(file_name, 'rb') as file:
            data: Dict[str] = json.load(file)

        transformed_data = deque([
            {
                'country': 'Ukraine',
                'regions': data['ukraine']
            }
        ])
        for country_data in data['world']:
            if country_data['country'] == 'Ukraine':
                continue
            all_region_data = {'region': {'label': {'en': 'all'}}}
            all_region_data.update(country_data)
            transformed_data.append(
                {
                    'country': country_data['country'],
                    'regions': [all_region_data]
                }
            )
        # del datasets

        for country_data in transformed_data:
            country_name = country_data['country']
            if country_filter and country_name not in country_filter:
                continue
            for region_data in country_data['regions']:
                columns['idx'].append(idx)
                columns['date'].append(date)
                columns['country'].append(country_name)
                columns['region'].append(region_data['label']['en'])
                columns['country_region'].append(f"{country_name}_{region_data['label']['en']}")
                for metric_name in metrics:
                    columns[metric_name].append(float(region_data[metric_name]))

    df = pd.DataFrame.from_dict(columns)
    df.date = pd.to_datetime(df.date)
    df.date.index = pd.PeriodIndex(df.date, freq="D", name="Period")
    df.country = df.country.astype('category')
    df.region = df.region.astype('category')
    df.country_region = df.country_region.astype('category')
    df['country_cat'] = df.country.cat.codes
    df['region_cat'] = df.region.cat.codes
    df['country_region_cat'] = df.country_region.cat.codes
    for idx, day_name in enumerate(calendar.day_name):
        df[day_name] = df['date'].apply(
            lambda x: 1. if x.day_name() == day_name else .0)
    logger.info(f"Dataset range: {df['date'].min()} - {df['date'].max()}")
    self._dataframe = df
    if region_filter is not None:
        self._dataframe = self._dataframe[self._dataframe.region.isin(region_filter)]
    return self._dataframe
    assert res == exp


@pytest.mark.parametrize(
    "array, expected_type, dtype",
    [
        (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
        (np.array(["a", "b"]), np.ndarray, "object"),
        (pd.Categorical(["a", "b"]), pd.Categorical, "category"),
        (
            pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
            DatetimeArray,
            "datetime64[ns, US/Central]",
        ),
        (
            pd.PeriodIndex([2018, 2019], freq="A"),
            PeriodArray,
            pd.core.dtypes.dtypes.PeriodDtype("A-DEC"),
        ),
        (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
        # This test is currently failing for datetime64[ns] and timedelta64[ns].
        # The NumPy type system is sufficient for representing these types, so
        # we just use NumPy for Series / DataFrame columns of these types (so
        # we get consolidation and so on).
        # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray
        # abstraction to for code reuse.
        # At the moment, we've judged that allowing this test to fail is more
        # practical that overriding Series._values to special case
        # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray.
        pytest.param(
            pd.DatetimeIndex(["2017", "2018"]),
def test_is_period(self):
    self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M')))
    self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M')))
    self.assertFalse(lib.is_period(pd.Timestamp('2011-01')))
    self.assertFalse(lib.is_period(1))
    self.assertFalse(lib.is_period(np.nan))
def test_eq(other):
    idx = pd.PeriodIndex(['2017', '2017', '2018'], freq="D")
    expected = np.array([True, True, False])
    result = idx == other
    tm.assert_numpy_array_equal(result, expected)
}, columns=pd.Index(['left', 'right'], name='side'))
df
df.unstack('state')


# In[ ]:


df.unstack('state').stack('side')


# ### Pivoting “Long” to “Wide” Format

# In[ ]:


data = pd.read_csv('examples/macrodata.csv')
data.head()
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D', 'end')
ldata = data.stack().reset_index().rename(columns={0: 'value'})


# In[ ]:


ldata[:10]


# In[ ]:


pivoted = ldata.pivot('date', 'item', 'value')
pivoted


# In[ ]:
def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(['1970-01-01'], freq='d',
                           tz='America/New_York', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['xyx'], ['xyx', 'zzz'], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name='a'), pd.Float64Index([1.0], name='b')]
    idx = pd.MultiIndex(levels=levels, labels=[[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [pd.Int64Index([1], name='a'),
              pd.CategoricalIndex(data=['xyx'], categories=['xyx'], name='b'),
              pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta')]
    idx = pd.MultiIndex(levels=levels, labels=[[0], [0], [0]],
                        names=['a', 'b', 'timedelta'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
def test_is_period_arraylike():
    assert not com.is_period_arraylike([1, 2, 3])
    assert not com.is_period_arraylike(pd.Index([1, 2, 3]))
    assert com.is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D"))
def test_is_period(self):
    assert lib.is_period(pd.Period("2011-01", freq="M"))
    assert not lib.is_period(pd.PeriodIndex(["2011-01"], freq="M"))
    assert not lib.is_period(pd.Timestamp("2011-01"))
    assert not lib.is_period(1)
    assert not lib.is_period(np.nan)
                       'All causes, by age (years), All Ages**': 'all',
                       'All causes, by age (years), LT 1': 'LT 1',
                       'All causes, by age (years), 1–24': '1-24',
                       'All causes, by age (years), 25–44': '25-44',
                       'All causes, by age (years), 45–64': '45-64',
                       'All causes, by age (years), ≥65': '65+',
                       'P&I† Total': 'total'}, inplace=True)

# totals in regions (quarters)
regions = deaths.loc[0:8, "area"].tolist()
deaths = deaths[deaths["area"].isin(regions)]
deaths["date"] = pd.to_datetime(deaths.year.astype(str), format='%Y') + \
    pd.to_timedelta((deaths.week - 1).mul(7).astype(str) + ' days')

deaths_Q = deaths.groupby(["area", pd.PeriodIndex(deaths.date, freq='Q')])["all"].sum()
deaths_Q = deaths_Q.unstack().rename(columns={
    "2016Q1": "2016Q1",
    "2016Q2": "2016Q2",
    "2016Q3": "2016Q3",
})

deaths_Q.plot(kind='bar', stacked=True, rot=90, edgecolor='black')
plt.xlabel('area')
plt.ylabel('deaths')
plt.tight_layout(pad=0., w_pad=-16.5, h_pad=0.0)

deaths_Q.T.plot(kind='bar', stacked=True, rot=0, edgecolor='black')
plt.xlabel('area')
plt.ylabel('deaths')
rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')  # sequence of periods within the range; a PeriodIndex
rng.asfreq('M', how='start')                                 # convert each Period to another frequency
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.asfreq('M', how='start')                                  # convert a period-indexed series

p = pd.Period('2012Q4', freq='Q-JAN')
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60   # 4 PM on the second-to-last business day of the quarter
rng = p4pm.to_timestamp()                                    # convert to Timestamp

rng = pd.date_range('2000-01-02', periods=3, freq='M')
ts = pd.Series(np.random.randn(3), index=rng)
ts.to_period()                                               # convert back to periods

data = pd.read_csv('./data/macrodata.csv')                   # has year and quarter columns
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')  # combine them into an index
data.index = index                                           # assign it
print(data.infl)

# Resampling and frequency conversion
rng = pd.date_range('2000-01-01', periods=12, freq='T')
ts = pd.Series(np.arange(12), index=rng)
ts.resample('5min', closed='right', label='left', loffset='-1s').sum()  # a groupby plus aggregation
ts.resample('5min').ohlc()                                   # compute open, high, low, close

index = pd.date_range('1/1/2000', periods=2, freq='W-WED')
frame = pd.DataFrame(np.random.randn(2, 4), index=index,
                     columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame.resample('D').asfreq()                                 # no aggregation
frame.resample('D').ffill(limit=2)                           # fill the NaNs

frame = pd.DataFrame(np.random.randn(24, 4),
                     index=pd.period_range('1-2000', '12-2001', freq='M'),