import pandas as pd

# Open the Beijing PM2.5 data file
filepath = './PM2.5/BeijingPM20100101_20151231.csv'
df = pd.read_csv(filepath)
print(df.head())
print('*' * 100)
print(df.index)
print('*' * 100)
# print(df.info())
print('*' * 100)

# PeriodIndex builds a time series index; here it is generated at hourly frequency
# Combine the separate year/month/day/hour columns into pandas' period (time series) type
period = pd.PeriodIndex(year=df['year'],
                        month=df['month'],
                        day=df['day'],
                        hour=df['hour'],
                        freq='H')
print(period)
print(type(period))
print('*' * 100)

# First: add the period index above to the DataFrame as a new column
df['datetime'] = period
print(df.head())
print('*' * 100)

# Then: set the datetime column as the (row) index
df.set_index('datetime', inplace=True)
print(df.head())
print('*' * 100)
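Once the hourly PeriodIndex is in place as the row index, partial-string selection works directly. A minimal sketch, assuming the df built above (the month chosen is arbitrary):

# Select every hourly record from January 2010 via partial-string indexing on the PeriodIndex
jan_2010 = df.loc['2010-01']
print(jan_2010.shape)
print(jan_2010['PM_US Post'].mean())  # may be NaN where the station has no readings yet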
Example #2
 def test_is_period(self):
     assert lib.is_period(pd.Period('2011-01', freq='M'))
     assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))
     assert not lib.is_period(pd.Timestamp('2011-01'))
     assert not lib.is_period(1)
     assert not lib.is_period(np.nan)
Example #3
 def test_isfinite_pandas_period_index_nat(self):
     daily = pd.date_range('2017-1-1', '2017-1-3', freq='D').to_period('D')
     daily = pd.PeriodIndex(list(daily) + [pd.NaT])
     self.assertEqual(isfinite(daily), np.array([True, True, True, False]))
Example #4
# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt
file_path = "./BeijingPM20100101_20151231.csv"

df = pd.read_csv(file_path)

# Combine the separate time columns into pandas' period type via PeriodIndex
period = pd.PeriodIndex(year=df["year"],month=df["month"],day=df["day"],hour=df["hour"],freq="H")
df["datetime"] = period
# print(df.head(10))

# Set datetime as the index
df.set_index("datetime",inplace=True)

# Downsample to 7-day averages
df = df.resample("7D").mean()
print(df.head())
# Handle missing data: drop missing values
# print(df["PM_US Post"])

data = df["PM_US Post"]
data_china = df["PM_Nongzhanguan"]

print(data_china.head(100))
# Plot
_x = data.index
_x = [i.strftime("%Y%m%d") for i in _x]
_x_china = [i.strftime("%Y%m%d") for i in data_china.index]
print(len(_x), len(_x_china))
_y = data.values
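The scraped example stops before the figure is drawn. A plausible completion, assuming the pyplot import at the top of the example (the tick spacing of 10 is an arbitrary choice):

_y_china = data_china.values

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y, label="US Post")
plt.plot(range(len(_x_china)), _y_china, label="Nongzhanguan")
plt.xticks(range(0, len(_x), 10), _x[::10], rotation=45)
plt.legend(loc="best")
plt.show()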
Example #5
import numpy as np
import pandas as pd

path = '~/data/'
# Load all the data
Client = pd.read_excel(path+'Client_info_20191118.xlsx',sheet_name='Sheet1')
TPA = pd.read_excel(path+'Copy_of_TPA_cash.xlsx',sheet_name='Sheet1')
Customer = pd.read_csv(path +'Customer_Data_Request_20191121.csv')
Perf = pd.read_csv(path +'Performance_Data_Request_20191204.csv')

# Client data columns selected to merge
Client_cols = ['Active',
 'Client ID',
 'Account Name',
 'Industry',
 'NAICS Code',
 'NAICS Description',
 'Broker',
 'TPA_x',
 'Launch Date',
 'Termination Date',
 'Affiliate/Fed Gov?',
 'Cash']

Client = Client.merge(TPA,how='left',left_on='Client ID',right_on='HOST ID')
Customer = Customer.merge(Client[Client_cols],how='left',left_on='Unique_Company_ID',right_on='Client ID')
Perf = Perf.merge(Customer, how='left',on='Unique_Customer_ID')

# Data Preprocessing of Perf
Perf['Year_and_Month'] = pd.to_datetime(Perf['Year_and_Month'].astype(str), format = '%Y%m')
Perf['YQ'] = pd.PeriodIndex(Perf['Year_and_Month'], freq='Q')
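For reference, converting month timestamps to quarterly periods maps each month into its calendar quarter. A minimal sketch, independent of the files above:

import pandas as pd

months = pd.to_datetime(['201911', '201912', '202001'], format='%Y%m')
print(pd.PeriodIndex(months, freq='Q'))  # 2019Q4, 2019Q4, 2020Q1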
Example #6
    def test_get_loc(self):
        # GH 17717
        p0 = pd.Period("2017-09-01")
        p1 = pd.Period("2017-09-02")
        p2 = pd.Period("2017-09-03")

        # get the location of p1/p2 from
        # monotonic increasing PeriodIndex with non-duplicate
        idx0 = pd.PeriodIndex([p0, p1, p2])
        expected_idx1_p1 = 1
        expected_idx1_p2 = 2

        assert idx0.get_loc(p1) == expected_idx1_p1
        assert idx0.get_loc(str(p1)) == expected_idx1_p1
        assert idx0.get_loc(p2) == expected_idx1_p2
        assert idx0.get_loc(str(p2)) == expected_idx1_p2

        msg = "Cannot interpret 'foo' as period"
        with pytest.raises(KeyError, match=msg):
            idx0.get_loc("foo")
        with pytest.raises(KeyError, match=r"^1\.1$"):
            idx0.get_loc(1.1)

        msg = (r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\],"
               r" dtype='period\[D\]', freq='D'\)' is an invalid key")
        with pytest.raises(TypeError, match=msg):
            idx0.get_loc(idx0)

        # get the location of p1/p2 from
        # monotonic increasing PeriodIndex with duplicate
        idx1 = pd.PeriodIndex([p1, p1, p2])
        expected_idx1_p1 = slice(0, 2)
        expected_idx1_p2 = 2

        assert idx1.get_loc(p1) == expected_idx1_p1
        assert idx1.get_loc(str(p1)) == expected_idx1_p1
        assert idx1.get_loc(p2) == expected_idx1_p2
        assert idx1.get_loc(str(p2)) == expected_idx1_p2

        msg = "Cannot interpret 'foo' as period"
        with pytest.raises(KeyError, match=msg):
            idx1.get_loc("foo")

        with pytest.raises(KeyError, match=r"^1\.1$"):
            idx1.get_loc(1.1)

        msg = (r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\],"
               r" dtype='period\[D\]', freq='D'\)' is an invalid key")
        with pytest.raises(TypeError, match=msg):
            idx1.get_loc(idx1)

        # get the location of p1/p2 from
        # non-monotonic increasing/decreasing PeriodIndex with duplicate
        idx2 = pd.PeriodIndex([p2, p1, p2])
        expected_idx2_p1 = 1
        expected_idx2_p2 = np.array([True, False, True])

        assert idx2.get_loc(p1) == expected_idx2_p1
        assert idx2.get_loc(str(p1)) == expected_idx2_p1
        tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2)
        tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2)
Example #7
    def __init__(self, data=None, origin=None, development=None,
                 columns=None, index=None, origin_format=None,
                 development_format=None, cumulative=None, *args, **kwargs):
        if data is None:
            ' Instance with nothing set'
            return
        # Sanitize inputs
        index, columns, origin, development = self._str_to_list(
            index, columns, origin, development)
        key_gr = origin + self._flatten(development, index)
        # Aggregate data
        data_agg = data.groupby(key_gr).sum().reset_index()
        if not index:
            index = ['Total']
            data_agg[index[0]] = 'Total'
        # Initialize origin and development dates and grains
        origin_date = TriangleBase._to_datetime(
            data_agg, origin, format=origin_format)
        self.origin_grain = TriangleBase._get_grain(origin_date)
        m_cnt = {'Y': 12, 'Q': 3, 'M': 1}
        if development:
            development_date = TriangleBase._to_datetime(
                data_agg, development, period_end=True,
                format=development_format)
            self.development_grain = TriangleBase._get_grain(development_date)
            col = 'development'
        else:
            development_date = origin_date + \
                pd.tseries.offsets.MonthEnd(m_cnt[self.origin_grain])
            self.development_grain = self.origin_grain
            col = None
        # Prep the data for 4D Triangle
        origin_date = pd.PeriodIndex(origin_date, freq=self.origin_grain).to_timestamp()
        data_agg = self._get_axes(data_agg, index, columns,
                                  origin_date, development_date)
        data_agg = pd.pivot_table(data_agg, index=index+['origin'],
                                  columns=col, values=columns,
                                  aggfunc='sum')
        # Assign object properties
        self.kdims = np.array(np.array(data_agg.index.droplevel(-1).unique()).tolist())
        self.odims = np.array(data_agg.index.levels[-1].unique())
        if development:
            self.ddims = np.array(data_agg.columns.levels[-1].unique())
            self.ddims = self.ddims * m_cnt[self.development_grain]
            self.vdims = np.array(data_agg.columns.levels[0].unique())
        else:
            self.ddims = np.array([None])
            self.vdims = np.array(data_agg.columns.unique())
        self.valuation_date = development_date.max()
        self.key_labels = index
        self._set_slicers()
        # Create 4D Triangle
        triangle = \
            np.reshape(np.array(data_agg), (len(self.kdims), len(self.odims),
                       len(self.vdims), len(self.ddims)))
        triangle = np.swapaxes(triangle, 1, 2)
        # Set all 0s to NAN for nansafe ufunc arithmetic
        triangle[triangle == 0] = np.nan

        self.values = np.array(triangle, dtype=kwargs.get('dtype', None))
        # Used to show NANs in lower part of triangle
        self.nan_override = False
        self.valuation = self._valuation_triangle()
        self.is_cumulative = cumulative
Example #8
def compute_short_term_reversal():
    resids = read_risk_data("residuals")
    srisk = read_risk_data("srisk")
    sig = (-resids / srisk).ewm(span=5, min_periods=2).mean()
    sig.index = pd.PeriodIndex(sig.index, freq="B")
    return rank_signal(sig)
Example #9
def read_risk_alphas():
    alphas = pd.read_csv(f"{ALPHA_DIR}/risk_alphas")
    alphas = alphas.set_index("date")
    alphas.index = pd.PeriodIndex(alphas.index, freq="B", name="date")
    return alphas[alphas.abs().max(axis=1) > 0]
Example #10
def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(["a"], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(["1970-01-01"],
                           freq="d",
                           tz="America/New_York",
                           name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(["xyx"], ["xyx", "zzz"],
                              ordered=True,
                              name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True,
                              name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name="a"), pd.Float64Index([1.0], name="b")]
    codes = [[0], [0]]
    idx = pd.MultiIndex(levels=levels, names=["a", "b"], codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]

    codes = [[0], [0], [0]]

    idx = pd.MultiIndex(levels=levels,
                        names=["a", "b", "timedelta"],
                        codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
Example #11
 def _datetime_index_to_period(self,
                               index: pd.DatetimeIndex) -> pd.PeriodIndex:
     if index.freq is None:
         return pd.PeriodIndex(index, freq=self.freq)
     else:
         return pd.PeriodIndex(index)
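The helper above distinguishes two cases: a DatetimeIndex that already carries a freq can be converted directly, while one without a freq needs the target frequency supplied. A minimal sketch of both paths in plain pandas (using .to_period() for the inferred case; self.freq is assumed to be 'D'):

import pandas as pd

idx_with_freq = pd.date_range('2021-01-01', periods=3, freq='D')  # .freq is set
print(idx_with_freq.to_period())                                  # frequency inferred from the index

idx_no_freq = pd.DatetimeIndex(['2021-01-01', '2021-01-05'])      # .freq is None
print(pd.PeriodIndex(idx_no_freq, freq='D'))                      # frequency must be given explicitly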
Example #12

# Example 5.2: Pair Trading AUD.CAD with Rollover Interests

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
#import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df=pd.read_csv('inputData_AUDCAD_20120426.csv')
#df['Date']=pd.to_datetime(df['Date'],  format='%Y%m%d').dt.date # remove HH:MM:SS
df['Date']=pd.to_datetime(df['Date'],  format='%Y%m%d')
df.set_index('Date', inplace=True)

aud=pd.read_csv('AUD_interestRate.csv')
audindex=pd.PeriodIndex(year=aud.Year, month=aud.Month, freq='M')
#aud.index=audindex.to_timestamp().date
aud.index=audindex.to_timestamp()

cad=pd.read_csv('CAD_interestRate.csv')
cadindex=pd.PeriodIndex(year=cad.Year, month=cad.Month, freq='M')
#cad.index=cadindex.to_timestamp().date
cad.index=cadindex.to_timestamp()

df=pd.merge(df, aud, how='outer', left_index=True, right_index=True)
df.drop({'Year', 'Month'}, axis=1, inplace=True)
df.rename({'Rates': 'AUD_Rates'}, axis=1, inplace=True)

df=pd.merge(df, cad, how='outer', left_index=True, right_index=True)
df.drop({'Year', 'Month'}, axis=1, inplace=True)
df.rename({'Rates': 'CAD_Rates'}, axis=1, inplace=True)
Example #13
    # # # Periods and period arithmetic
    # # # # A period represents a span of time, such as days, months, quarters or years. The Period class represents this data type; its constructor takes a string or an integer plus a frequency
    p = pd.Period(2007, freq='A-DEC')
    # # # # This Period object represents the whole span from Jan 1, 2007 to Dec 31, 2007. Adding or subtracting an integer shifts the period by that number of frequency units
    # print(p + 5)
    # # # # If two Period objects have the same frequency, their difference is the number of units between them
    # print(pd.Period('2014', freq='A-DEC') - p)
    # # # # The period_range function can be used to create a regular range of periods
    rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
    # print(rng)
    # # # # The PeriodIndex class holds a sequence of Periods and can be used as an axis index in any pandas data structure
    # print(Series(np.random.randn(6),index=rng))
    # # # # The PeriodIndex constructor also accepts a list of strings directly
    values = ['2001Q3', '2002Q2', '2003Q1']
    index = pd.PeriodIndex(values, freq='Q-DEC')
    # print(index)

    # # # Period frequency conversion
    # # # # Both Period and PeriodIndex objects can be converted to another frequency with their asfreq method
    p = pd.Period('2007', freq='A-DEC')
    # print(p.asfreq('M',how='start'))
    # print(p.asfreq('M',how='end'))
    p = pd.Period('2007', freq='A-JUN')
    # print(p.asfreq('M', 'start'))
    # print(p.asfreq('M', 'end'))
    # # # # When converting from a higher frequency to a lower one, the superperiod is determined by where the subperiod belongs
    p = pd.Period('2007-08', 'M')
    # print(p.asfreq('A-JUN'))
    # # # # Frequency conversion of a PeriodIndex or a time series works the same way
    rng = pd.period_range('2006', '2009', freq='A-DEC')
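A small runnable sketch of the behaviour the commented-out prints above describe (assuming only pandas is imported):

import pandas as pd

p = pd.Period(2007, freq='A-DEC')   # the calendar year 2007
print(p + 5)                        # 2012: adding an integer shifts by whole periods
print(p.asfreq('M', how='start'))   # 2007-01
print(p.asfreq('M', how='end'))     # 2007-12

# With a June fiscal-year frequency, August 2007 belongs to the year ending June 2008
print(pd.Period('2007-08', 'M').asfreq('A-JUN'))  # 2008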
Example #14
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#CrimeDataDayLevel
data = pd.read_csv('CrimeDataDayLevel.csv',
                   parse_dates=[
                       'BeginDate', 'ReportedDate', 'EnteredDate',
                       'lastchanged', 'LastUpdateDate'
                   ])

data['CrimeDate'] = data['BeginDate'].dt.date
data['Year'] = data['BeginDate'].dt.year
data['Year'] = data['Year'].astype('str')
data['Year-Month'] = pd.PeriodIndex(
    data['BeginDate'], freq='M')  #to represent year-month of the crime
data['Month'] = data['BeginDate'].dt.month

data['Weekday'] = data['BeginDate'].dt.weekday  # 0 is Monday
data['Hour'] = data['BeginDate'].dt.hour

#categorizing offence types
segment_data = pd.read_csv('Segment_data.csv')
data = pd.merge(data, segment_data, on="Offense")

#loading weekday names
week_data = pd.read_csv('Week_data.csv')
data = pd.merge(data, week_data, on="Weekday")

#categorizing parts of a day
time_data = pd.read_csv('Time_data.csv')
Example #15
    def test_constructor_floats(self, floats):
        with pytest.raises(TypeError):
            pd.PeriodIndex._simple_new(floats, freq='M')

        with pytest.raises(TypeError):
            pd.PeriodIndex(floats, freq='M')
Example #16

 def origin(self, value):
     self._len_check(self.origin, value)
     value = pd.PeriodIndex([item for item in list(value)],
                            freq=self.origin_grain).to_timestamp()
     self.odims = value.values
Example #17
def test_is_period_deprecated():
    with tm.assert_produces_warning(FutureWarning):
        assert not com.is_period([1, 2, 3])
        assert not com.is_period(pd.Index([1, 2, 3]))
        assert com.is_period(pd.PeriodIndex(["2017-01-01"], freq="D"))
Example #18

    def grain(self, grain="", trailing=False, inplace=False):
        """Changes the grain of a cumulative triangle.

        Parameters
        ----------
        grain : str
            The grain to which you want your triangle converted, specified as
            'OXDY' where X and Y can take on values of ``['Y', 'Q', 'M']``.
            For example, 'OYDY' for Origin Year/Development Year, 'OQDM'
            for Origin Quarter/Development Month, etc.
        trailing : bool
            For partial years/quarters, trailing will set the year/quarter end to
            that of the latest available from the data.
        inplace : bool
            Whether to mutate the existing Triangle instance or return a new
            one.

        Returns
        -------
            Triangle
        """
        ograin_old, ograin_new = self.origin_grain, grain[1:2]
        dgrain_old, dgrain_new = self.development_grain, grain[-1]
        valid = {"Y": ["Y"], "Q": ["Q", "Y"], "M": ["Y", "Q", "M"]}
        if ograin_new not in valid.get(
                ograin_old, []) or dgrain_new not in valid.get(dgrain_old, []):
            raise ValueError("New grain not compatible with existing grain")
        if (self.is_cumulative is None and dgrain_old != dgrain_new
                and self.shape[-1] > 1):
            raise AttributeError(
                "The is_cumulative attribute must be set before using grain method."
            )
        if valid["M"].index(ograin_new) > valid["M"].index(dgrain_new):
            raise ValueError(
                "Origin grain must be coarser than development grain")
        obj = self.dev_to_val()
        if ograin_new != ograin_old:
            if trailing:
                mn = self.origin[-1].strftime(
                    "%b").upper() if trailing else "DEC"
                freq = "Q-" if ograin_new == "Q" else "A-"
                o = pd.PeriodIndex(self.origin, freq=freq + mn)
                o = np.array(o.to_timestamp(how="s"))
            else:
                freq = "%YQ%q" if ograin_new == "Q" else "%Y"
                o = pd.to_datetime(self.origin.strftime(freq)).values
            values = [
                getattr(obj.loc[..., i, :], "sum")(2,
                                                   auto_sparse=False,
                                                   keepdims=True)
                for i in self.origin.groupby(o).values()
            ]
            obj = concat(values, axis=2, ignore_index=True)
            obj.odims = np.unique(o)
            obj.origin_grain = ograin_new
            if len(obj.ddims) > 1 and pd.Timestamp(obj.odims[0]).strftime(
                    "%Y%m") != obj.valuation[0].strftime("%Y%m"):
                addl_ts = (pd.period_range(
                    obj.odims[0], obj.valuation[0],
                    freq="M")[:-1].to_timestamp().values)
                addl = obj.iloc[..., -len(addl_ts):] * 0
                addl.ddims = addl_ts
                obj = concat((addl, obj), axis=-1)
        if dgrain_old != dgrain_new and obj.shape[-1] > 1:
            step = self._dstep()[dgrain_old][dgrain_new]
            d = np.sort(
                len(obj.development) -
                np.arange(0, len(obj.development), step) - 1)
            if obj.is_cumulative:
                obj = obj.iloc[..., d]
            else:
                ddims = obj.ddims[d]
                d2 = [d[0]] * (d[0] + 1) + list(
                    np.repeat(np.array(d[1:]), step))
                values = [
                    getattr(obj.iloc[..., i], "sum")(3,
                                                     auto_sparse=False,
                                                     keepdims=True)
                    for i in obj.development.groupby(d2).groups.values()
                ]
                obj = concat(values, axis=3, ignore_index=True)
                obj.ddims = ddims
            obj.development_grain = dgrain_new
        obj = obj.dev_to_val() if self.is_val_tri else obj.val_to_dev()
        if inplace:
            self = obj
            return self
        return obj
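The docstring above defines the 'OXDY' grain strings. A hypothetical usage sketch, assuming tri is an existing cumulative chainladder Triangle currently at origin-quarter/development-quarter grain (the variable name is illustrative):

# Coarsen to origin year / development year
tri_yearly = tri.grain('OYDY')
# Origin year, development quarter is also allowed
tri_mixed = tri.grain('OYDQ')
# tri.grain('OQDY') would raise ValueError: the origin grain may not be finer
# than the development grain, per the check at the top of the method.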
Example #19
        assert s.dtype == 'Period[M]'
        for res, exp in zip(s, vals):
            assert isinstance(res, pd.Period)
            assert res.freq == 'M'
            assert res == exp


@pytest.mark.parametrize(
    'array, expected_type, dtype',
    [
        (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'),
        (np.array(['a', 'b']), np.ndarray, 'object'),
        (pd.Categorical(['a', 'b']), pd.Categorical, 'category'),
        (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), DatetimeArray,
         'datetime64[ns, US/Central]'),
        (pd.PeriodIndex([2018, 2019], freq='A'), pd.core.arrays.PeriodArray,
         pd.core.dtypes.dtypes.PeriodDtype("A-DEC")),
        (pd.IntervalIndex.from_breaks(
            [0, 1, 2]), pd.core.arrays.IntervalArray, 'interval'),

        # This test is currently failing for datetime64[ns] and timedelta64[ns].
        # The NumPy type system is sufficient for representing these types, so
        # we just use NumPy for Series / DataFrame columns of these types (so
        # we get consolidation and so on).
        # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray
        # abstraction for code reuse.
        # At the moment, we've judged that allowing this test to fail is more
        # practical than overriding Series._values to special case
        # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray.
        pytest.param(
            pd.DatetimeIndex(['2017', '2018']),
Example #20
def model(params,
          series,
          exogen,
          yearly_seasonality,
          alpha=None,
          beta=None,
          gamma=None,
          omega=None,
          epsilon=None,
          smoothing=None):
    """
    This function runs an ETS(M,Ad,M) model with exogen variables. This is an Error, Trend, Seasonality exponential smoothing
    model.The first M stands for multiplicative or relative errors, the Ad for an additive dampend trend and the last M for
    multiplicative seasonality. The model also contains additional exogen variables which are dummies for certain events.
    The actual computation of the fit model is done in the function ETS_M_Ad_M which further contains the functions
    calc_new_estimates, calc_error, save_estimates and seasonal_matrices. These are all explained in the following code.
    
    Parameters:

        params: model parameters

        series: the time series in a pandas Series format

        exog: the exogen variables in a pandas DataFrame format with each column being a variable and the time as its index
    
    Return: The function returns the sum of squared error of the fitted model. This allows the model to be inputed
    into an optimizer which minimizes the sum of squared residuals dependent on the input parameters (params).
    """
    # defining all model parameters from the params vector
    # Note that the seasonal and exogen variable parameters are vectors while the other parameters are scalars

    if smoothing:
        alpha = alpha
        beta = beta
        gamma = gamma
        omega = omega
        epsilon = epsilon
        level_initial = params[0]
        slope_initial = params[1]
        seasonal_initial = params[2:9]  # previously this was wrapped in np.vstack, which appeared to break it

        # added len(exogen) as now we have variable number of exogen variables due to days before and after

        reg = (params[9:9 + len(exogen.columns)])

    else:

        alpha = params[0]
        beta = params[1]
        gamma = params[2]
        omega = params[3]
        level_initial = params[4]
        slope_initial = params[5]
        seasonal_initial = params[6:13]

        #added len(exogen) as now we have variable number of exogen variables due to days before and after

        reg = (params[13:13 + len(exogen.columns)])

    #defining the initial yearly seasonal components as a fourier series

    if yearly_seasonality == "fourier":

        # defining the index as a date variable which will become relevant for subsequent computation
        yearly = pd.DataFrame({'date': series.index})
        yearly = yearly.set_index(pd.PeriodIndex(series.index, freq='D'))

        # yearly seasonality with N=10 fourier series elements

        # N=1
        yearly['yearly_sin365'] = np.sin(2 * np.pi * yearly.index.dayofyear /
                                         365.25)
        yearly['yearly_cos365'] = np.cos(2 * np.pi * yearly.index.dayofyear /
                                         365.25)
        # N=2
        yearly['yearly_sin365_2'] = np.sin(4 * np.pi * yearly.index.dayofyear /
                                           365.25)
        yearly['yearly_cos365_2'] = np.cos(4 * np.pi * yearly.index.dayofyear /
                                           365.25)
        # N=3
        yearly['yearly_sin365_3'] = np.sin(6 * np.pi * yearly.index.dayofyear /
                                           365.25)
        yearly['yearly_cos365_3'] = np.cos(6 * np.pi * yearly.index.dayofyear /
                                           365.25)
        # N=4
        yearly['yearly_sin365_4'] = np.sin(8 * np.pi * yearly.index.dayofyear /
                                           365.25)
        yearly['yearly_cos365_4'] = np.cos(8 * np.pi * yearly.index.dayofyear /
                                           365.25)
        # N=5
        yearly['yearly_sin365_5'] = np.sin(10 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_5'] = np.cos(10 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        # N=6
        yearly['yearly_sin365_6'] = np.sin(12 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_6'] = np.cos(12 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        # N=7
        yearly['yearly_sin365_7'] = np.sin(14 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_7'] = np.cos(14 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        # N=8
        yearly['yearly_sin365_8'] = np.sin(16 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_8'] = np.cos(16 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        # N=9
        yearly['yearly_sin365_9'] = np.sin(18 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_9'] = np.cos(18 * np.pi *
                                           yearly.index.dayofyear / 365.25)
        # N=10
        yearly['yearly_sin365_10'] = np.sin(20 * np.pi *
                                            yearly.index.dayofyear / 365.25)
        yearly['yearly_cos365_10'] = np.cos(20 * np.pi *
                                            yearly.index.dayofyear / 365.25)
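        # Note: the ten harmonic pairs above could equivalently be generated in a loop
        # (a sketch only; kept as a comment so the original column-by-column code stays intact):
        # for n in range(1, 11):
        #     suffix = '' if n == 1 else f'_{n}'
        #     yearly[f'yearly_sin365{suffix}'] = np.sin(2 * n * np.pi * yearly.index.dayofyear / 365.25)
        #     yearly[f'yearly_cos365{suffix}'] = np.cos(2 * n * np.pi * yearly.index.dayofyear / 365.25)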

        # deleting date column as it is no longer required and should not be in the linear regression

        del yearly['date']

        if smoothing:

            # 1. compute the fourier series results from the weights times the cos(t) and sin(t) for t=0...365

            yearly_init = params[9 + len(exogen.columns):29 +
                                 len(exogen.columns)] * yearly.iloc[0:365]

            # 2. sum up the total yearly seasonality of each day by summing up all weighted trigonometric functional values

            yearly_init = 1 + yearly_init.sum(axis=1)

            # 3. define this array of 365 dummies as an array

            yearly_init = yearly_init  #np.vstack(yearly_init)

            # 4. turn the array around as we want the most recent seasonality effect to be at the end

            yearly_init = yearly_init[::-1]

            # yearly smoothing parameter

            epsilon = epsilon

        else:

            # 1. compute the fourier series results from the weights times the cos(t) and sin(t) for t=0...365

            yearly_init = params[13 + len(exogen.columns):33 +
                                 len(exogen.columns)] * yearly.iloc[0:365]

            # 2. sum up the total yearly seasonality of each day by summing up all weighted trigonometric functional values

            yearly_init = 1 + yearly_init.sum(axis=1)

            # 3. define this array of 365 dummies as an array

            yearly_init = yearly_init  #np.vstack(yearly_init)

            # 4. turn the array around as we want the most recent seasonality effect to be at the end

            yearly_init = yearly_init[::-1]

            # yearly smoothing parameter

            epsilon = params[33 + len(exogen.columns)]

    # Built-in exception handling below prints the parameters and the error sum if an error occurs in the model
    # Note that the exception also prints the yearly seasonality parameters if they are specified in the model

    # For the dummy model we have 12 dummies and a smoothing parameter

    elif yearly_seasonality == "dummies":

        if smoothing:
            yearly_init = (params[9 + len(exogen.columns):21 +
                                  len(exogen.columns)])
            epsilon = epsilon
        else:
            yearly_init = (params[13 + len(exogen.columns):25 +
                                  len(exogen.columns)])
            epsilon = params[25 + len(exogen.columns)]

    try:
        if yearly_seasonality == "fourier" or yearly_seasonality == 'dummies':
            results = ETS_M_Ad_M(alpha, beta, gamma, omega, level_initial,
                                 slope_initial, seasonal_initial, reg, series,
                                 exogen, yearly_seasonality, yearly_init,
                                 epsilon)
        else:
            results = ETS_M_Ad_M(alpha,
                                 beta,
                                 gamma,
                                 omega,
                                 level_initial,
                                 slope_initial,
                                 seasonal_initial,
                                 reg,
                                 series,
                                 exogen,
                                 yearly_seasonality,
                                 yearly_init=None,
                                 epsilon=None)

    except:
        if yearly_seasonality == "fourier":
            print('alpha:', alpha, 'beta:', beta, 'gamma:', gamma, 'omega:',
                  omega, level_initial, slope_initial, seasonal_initial,
                  'reg:', reg, 'Fourier weights:',
                  params[13 + len(exogen.columns):33 + len(exogen.columns)],
                  'epsilon:', params[33 + len(exogen.columns)])
            if error_sum:
                print('error_sum:', error_sum)
        if yearly_seasonality == "dummies":
            print('alpha:', alpha, 'beta:', beta, 'gamma:', gamma, 'omega:',
                  omega, level_initial, slope_initial, seasonal_initial,
                  'reg:', reg, 'monthly dummies:',
                  params[13 + len(exogen.columns):25 + len(exogen.columns)],
                  'epsilon:', params[25 + len(exogen.columns)])
            if error_sum:
                print('error_sum:', error_sum)
        else:
            print('alpha:', alpha, 'beta:', beta, 'gamma:', gamma, 'omega:',
                  omega, level_initial, slope_initial, seasonal_initial,
                  'reg:', reg)
            if error_sum:
                print('error_sum:', error_sum)

    error_list = results['errors_list']

    error_list = [number**2 for number in error_list]

    error_sum = sum(error_list)

    return error_sum
Example #21
    def scrape(self, metrics: Set[str], country_filter: Optional[List[str]] = None,
               region_filter: Optional[List[str]] = None):

        columns = {
            'idx': deque(),
            'date': deque(),
            'country': deque(),
            'country_region': deque(),
            'region': deque(),
            # 'series': deque(),
            # 'weekday': deque(),
            # 'value': deque()
        }

        for metric_name in metrics:
            assert metric_name in self.metrics
            columns[metric_name] = deque()

        for idx, info in enumerate(self.download()):
            file_name, date = info
            with open(file_name, 'rb') as file:
                data: Dict[str] = json.load(file)
            transformed_data = deque([
                {
                    'country': 'Ukraine',
                    'regions': data['ukraine']
                }
            ])

            for country_data in data['world']:
                if country_data['country'] == 'Ukraine':
                    continue
                all_region_data = {'region': {'label': {'en': 'all'}}}
                all_region_data.update(country_data)
                transformed_data.append(
                    {
                        'country': country_data['country'],
                        'regions': [all_region_data]
                    }
                )
            # del datasets
            for country_data in transformed_data:
                country_name = country_data['country']
                if country_filter and country_name not in country_filter:
                    continue

                for region_data in country_data['regions']:
                    columns['idx'].append(idx)
                    columns['date'].append(date)
                    columns['country'].append(country_name)
                    columns['region'].append(region_data['label']['en'])
                    columns['country_region'].append(f"{country_name}_{region_data['label']['en']}")
                    for metric_name in metrics:
                        columns[metric_name].append(float(region_data[metric_name]))

        df = pd.DataFrame.from_dict(columns)
        df.date = pd.to_datetime(df.date)
        df.date.index = pd.PeriodIndex(df.date, freq="D", name="Period")
        df.country = df.country.astype('category')
        df.region = df.region.astype('category')
        df.country_region = df.country_region.astype('category')
        df['country_cat'] = df.country.cat.codes
        df['region_cat'] = df.region.cat.codes
        df['country_region_cat'] = df.country_region.cat.codes

        for idx, day_name in enumerate(calendar.day_name):
            df[day_name] = df['date'].apply(
                lambda x: 1. if x.day_name() == day_name else .0)

        logger.info(f"Dataset range: {df['date'].min()} - {df['date'].max()}")
        self._dataframe = df
        if region_filter is not None:
            self._dataframe = self._dataframe[self._dataframe.region.isin(region_filter)]
        return self._dataframe
Example #22
            assert res == exp


@pytest.mark.parametrize(
    "array, expected_type, dtype",
    [
        (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
        (np.array(["a", "b"]), np.ndarray, "object"),
        (pd.Categorical(["a", "b"]), pd.Categorical, "category"),
        (
            pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
            DatetimeArray,
            "datetime64[ns, US/Central]",
        ),
        (
            pd.PeriodIndex([2018, 2019], freq="A"),
            PeriodArray,
            pd.core.dtypes.dtypes.PeriodDtype("A-DEC"),
        ),
        (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
        # This test is currently failing for datetime64[ns] and timedelta64[ns].
        # The NumPy type system is sufficient for representing these types, so
        # we just use NumPy for Series / DataFrame columns of these types (so
        # we get consolidation and so on).
        # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray
        # abstraction for code reuse.
        # At the moment, we've judged that allowing this test to fail is more
        # practical than overriding Series._values to special case
        # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray.
        pytest.param(
            pd.DatetimeIndex(["2017", "2018"]),
Example #23
 def test_is_period(self):
     self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M')))
     self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M')))
     self.assertFalse(lib.is_period(pd.Timestamp('2011-01')))
     self.assertFalse(lib.is_period(1))
     self.assertFalse(lib.is_period(np.nan))
Example #24
def test_eq(other):
    idx = pd.PeriodIndex(['2017', '2017', '2018'], freq="D")
    expected = np.array([True, True, False])
    result = idx == other

    tm.assert_numpy_array_equal(result, expected)
Example #25
},
                  columns=pd.Index(['left', 'right'], name='side'))
df
df.unstack('state')

# In[ ]:

df.unstack('state').stack('side')

# ### Pivoting “Long” to “Wide” Format

# In[ ]:

data = pd.read_csv('examples/macrodata.csv')
data.head()
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D', 'end')
ldata = data.stack().reset_index().rename(columns={0: 'value'})

# In[ ]:

ldata[:10]

# In[ ]:

pivoted = ldata.pivot('date', 'item', 'value')
pivoted

# In[ ]:
Example #26

def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(['1970-01-01'], freq='d',
                           tz='America/New_York', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['xyx'], ['xyx', 'zzz'], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name='a'),
              pd.Float64Index([1.0], name='b')]
    idx = pd.MultiIndex(levels=levels, labels=[[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [pd.Int64Index([1], name='a'),
              pd.CategoricalIndex(data=['xyx'], categories=['xyx'], name='b'),
              pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta')]
    idx = pd.MultiIndex(levels=levels, labels=[[0], [0], [0]], names=['a', 'b', 'timedelta'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
Example #27
def test_is_period_arraylike():
    assert not com.is_period_arraylike([1, 2, 3])
    assert not com.is_period_arraylike(pd.Index([1, 2, 3]))
    assert com.is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D"))
Example #28
 def test_is_period(self):
     assert lib.is_period(pd.Period("2011-01", freq="M"))
     assert not lib.is_period(pd.PeriodIndex(["2011-01"], freq="M"))
     assert not lib.is_period(pd.Timestamp("2011-01"))
     assert not lib.is_period(1)
     assert not lib.is_period(np.nan)
Example #29
                         'All causes, by age (years), All Ages**':'all',
                         'All causes, by age (years), LT 1':'LT 1',
                         'All causes, by age (years), 1–24':'1-24',
                         'All causes, by age (years), 25–44':'25-44',
                         'All causes, by age (years), 45–64':'45-64',
                         'All causes, by age (years), ≥65':'65+',
                         'P&I† Total':'total'},
                         inplace = True)

#totals in regions (quarters)
regions = deaths.loc[0:8, "area"].tolist()
deaths = deaths[deaths["area"].isin(regions)]

deaths["date"] = pd.to_datetime(deaths.year.astype(str), format='%Y') + \
                     pd.to_timedelta((deaths.week-1).mul(7).astype(str) + ' days')
deaths_Q = deaths.groupby(["area", pd.PeriodIndex(deaths.date, freq='Q')])["all"].sum()
deaths_Q = deaths_Q.unstack().rename(columns = {
            "2016Q1" : "2016Q1",
            "2016Q2" : "2016Q2",
            "2016Q3" : "2016Q3",
        })

deaths_Q.plot(kind='bar', stacked=True, rot=90, edgecolor='black')
plt.xlabel('area')
plt.ylabel('deaths')
plt.tight_layout(pad=0., w_pad=-16.5, h_pad=0.0)

deaths_Q.T.plot(kind='bar', stacked=True, rot=0, edgecolor='black')
plt.xlabel('area')
plt.ylabel('deaths')
Example #30
import numpy as np
import pandas as pd

rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')  # sequence of periods within the range; PeriodIndex

period = pd.Period('2007', freq='A-DEC')
period.asfreq('M', how='start')  # convert a Period into another frequency
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.asfreq('M', how='start')  # convert a PeriodIndex

p = pd.Period('2012Q4', freq='Q-JAN')
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60  # 4 PM on the second-to-last business day of the quarter
rng = p4pm.to_timestamp()  # convert to Timestamp

rng = pd.date_range('2000-01-02', periods=3, freq='M')
ts = pd.Series(np.random.randn(3), index=rng)
ts.to_period()  # convert back to periods

data = pd.read_csv('./data/macrodata.csv')  # having a year and quarter attributes
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')  # combine to form an index
data.index = index  # assign it
print(data.infl)

# Resampling and Frequency Conversion
rng = pd.date_range('2000-01-01', periods=12, freq='T')
ts = pd.Series(np.arange(12), index=rng)
ts.resample('5min', closed='right', label='left', loffset='-1s').sum()  # a groupby and aggregation
ts.resample('5min').ohlc()  # compute open, high, low, close

index=pd.date_range('1/1/2000', periods=2, freq='W-WED')
frame = pd.DataFrame(np.random.randn(2, 4), index=index, columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame.resample('D').asfreq()  # no aggregation
frame.resample('D').ffill(limit=2)  # forward-fill the NaNs, at most 2 periods

frame = pd.DataFrame(np.random.randn(24, 4), index=pd.period_range('1-2000', '12-2001', freq='M'),
                     columns=['Colorado', 'Texas', 'New York', 'Ohio'])