Exemple #1
0
def stationarityTest(data):
    """Difference ``data`` until the ADF test p-value drops below 0.05.

    Args:
        data: pandas object accepted by ``ADF`` that supports ``diff()`` and
            ``dropna()``.

    Returns:
        Tuple ``(diff, p_value)``: the number of first differences applied
        and the ADF p-value of the resulting series.
    """
    diff = 0
    series = data
    adf_data = ADF(series)
    while adf_data[1] >= 0.05:
        diff += 1
        # Difference the previous result again.  ``data.diff(diff)`` would
        # compute a lag-``diff`` difference (x[t] - x[t-diff]), which is not
        # the diff-th order difference this counter stands for.
        series = series.diff().dropna()
        adf_data = ADF(series)
    return (diff, adf_data[1])
Exemple #2
0
def adf_diff(data: pd.DataFrame, plot: bool = False) -> int:
    """
    Choose a differencing order from ADF tests on 0, 1 and 2 differences.

    The raw data, its first difference and its second difference (NaNs
    replaced by 0.0) are each passed to ``ADF``.  The index of the first
    candidate whose test statistic lies below every critical value is
    returned.  When no candidate qualifies the running sum 0 + 1 + 2 = 3 is
    returned.  With ``plot=True`` the three series are drawn in one figure.
    """
    # Differenced candidates, NaNs filled with 0.0.
    first_raw = data.diff(1)
    once = first_raw.fillna(0.0)
    twice = first_raw.diff(1).fillna(0.0)

    # All three tests run up front, in order: original, diff1, diff2.
    results = [ADF(candidate) for candidate in (data, once, twice)]

    p = 0
    for order, result in enumerate(results):
        statistic, _p_value, _, _, critical, _ = result
        if not statistic < min(critical.values()):
            # Candidate rejected: accumulate its index and try the next one.
            p += order
            continue
        p = order
        print('p={}\nadf={}'.format(order, result))
        break

    if plot:
        plt.figure(figsize=(20, 5))
        for series, label, color in ((data, 'Original', 'blue'),
                                     (once, 'Diff1', 'red'),
                                     (twice, 'Diff2', 'green')):
            plt.plot(series, label=label, color=color)
        plt.legend(loc='best')
        # NOTE(review): `index` is not defined inside this function; it is
        # presumably a module-level name -- confirm.
        plt.title("{}".format(index))
        plt.show()

    return p
def practice1_saling_data_analysis():
    """Plot the sales CSV and difference it until it is stationary non-white-noise.

    Reads ``L09-TimeSeriesPrediction/data/arima_data.csv`` indexed by its
    ``date`` column, plots it, then applies first differences until both the
    ADF p-value and the lag-1 Ljung-Box p-value are below 0.05.  The original
    and differenced data are finally handed to ``order_determination``.
    """
    data = pd.read_csv('L09-TimeSeriesPrediction/data/arima_data.csv', index_col='date')
    data.index = pd.to_datetime(data.index)    # turn the string index into a datetime index
    data.plot()
    plt.show()

    period = 0    # number of differences applied so far; 0 = original series
    D_data = data
    ADF_p = ADF(D_data)[1]
    acorr_ljungbox_p = list(acorr_ljungbox(D_data, lags=1)[1])[0]

    # Keep differencing while the series is non-stationary or white noise.
    # (`||` is not a Python operator; the boolean operator is `or`.)
    while ADF_p >= 0.05 or acorr_ljungbox_p >= 0.05:
        period += 1
        # D_data already holds the previous difference, so one more first
        # difference yields the period-th order difference.  Passing
        # periods=period here would stack growing lags on top of each other.
        D_data = D_data.diff().dropna()
        ADF_p = ADF(D_data)[1]
        acorr_ljungbox_p = list(acorr_ljungbox(D_data, lags=1)[1])[0]

    p, q = order_determination(data, D_data)
def diff(time_series, if_plot, name, if_diff):
    """Difference a series until the ADF test p-value is at most 0.05.

    Args:
        time_series: pandas series to test; it is deep-copied, not modified.
        if_plot: whether to call ``plot_diff`` with the original and the
            differenced series.
        name: name of the series, forwarded to ``plot_diff``.
        if_diff: when false, the copy is returned untouched with a count of 0.

    Returns:
        Tuple ``(series, counts)``: the (possibly differenced) copy and the
        number of first differences applied.
    """
    counts = 0  # how many times the series has been differenced
    copy_series = copy.deepcopy(time_series)

    # Nothing to do when differencing is disabled.
    if not if_diff:
        return copy_series, counts

    # Keep differencing while the ADF p-value is above 5%.  The test is run
    # once per candidate series and the result reused for the log line.
    adf_result = ADF(copy_series.tolist())
    while adf_result[1] > 0.05:
        logger.info("time " + str(counts) + " ADF test: " + str(adf_result))
        # The NaN introduced by diff(1) is replaced with 0.
        copy_series = copy_series.diff(1).fillna(0)
        counts += 1
        adf_result = ADF(copy_series.tolist())

    logger.info("time " + str(counts) + " ADF test: " + str(adf_result))

    # Plot the original and differenced series together.
    if if_plot:
        plot_diff(time_series, copy_series, counts, name)

    return copy_series, counts
Exemple #5
0
def get_adf():
    """Print how many first differences make the selected column stationary.

    Reads ``../data/discdata_processed.xls``, leaves out the last five rows,
    and differences the column selected below until the ADF p-value is at
    most 0.05.
    """
    infile = "../data/discdata_processed.xls"
    data = pd.read_excel(infile)
    data = data.iloc[:len(data) - 5]
    series = data["CWXT_DB:184:D:\\"]
    adf = ADF(series)
    diff = 0
    while adf[1] > 0.05:
        diff += 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)

    print("经过%d阶差分后归于平稳,p值为%s" % (diff, adf[1]))
Exemple #6
0
def stationarity_test(dataset, number):
    """Print the differencing order at which ``rentNumber`` passes the ADF test.

    Args:
        dataset: DataFrame with a ``rentNumber`` column; it is copied first.
        number: how many trailing rows to leave out of the test.
    """
    data = dataset.copy()
    data = data.iloc[:len(data) - number]  # the last `number` rows are not tested
    # Stationarity test
    from statsmodels.tsa.stattools import adfuller as ADF
    diff = 0
    series = data['rentNumber']
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
Exemple #7
0
def arima_regression():
    """Run the ARIMA walkthrough on the sales spreadsheet.

    Loads ``SRC_PATH + '/data/arima_data.xls'``, plots the series and its
    ACF/PACF, prints ADF and Ljung-Box results for the raw and first
    differenced data, picks (p, q) by minimum BIC over ARIMA(p, 1, q) fits,
    then fits that model and requests a ``forecastnum``-step forecast.
    """
    # Parameter initialisation
    discfile = SRC_PATH + '/data/arima_data.xls'
    forecastnum = 5

    # Load the data using the date column as the index.
    data = pd.read_excel(discfile, index_col=u'日期')

    # Time-series plot
    data.plot()
    plt.show()

    # Autocorrelation plot
    plot_acf(data).show()
    # print is called as a function throughout: the statement form is a
    # syntax error on Python 3, which the rest of this function targets.
    print(u'原始序列的ADF检验结果为:', ADF(data[u'销量']))

    # First difference
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    D_data.plot()  # time-series plot
    plt.show()

    print(data)
    print(D_data)

    plot_acf(D_data).show()  # autocorrelation plot
    plot_pacf(D_data).show()  # partial autocorrelation plot
    print(u'差分序列的ADF检验结果为:', ADF(D_data[u'销量差分']))  # stationarity test

    # White-noise test (statistic and p-value)
    print(u'差分序列的白噪声检验结果为:', acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)
    # Order selection; orders are capped at length / 10.
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            # Some (p, q) combinations fail to fit; record None for those.
            # Exception (not a bare except) so Ctrl-C still interrupts.
            try:
                tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
            except Exception:
                tmp.append(None)
        bic_matrix.append(tmp)

    bic_matrix = pd.DataFrame(bic_matrix)

    # Flatten with stack(), then locate the minimum BIC.
    p, q = bic_matrix.stack().idxmin()
    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))
    model = ARIMA(data, (p, 1, q)).fit()  # fit ARIMA(p, 1, q) with the selected orders
    model.summary2()  # model report
    model.forecast(forecastnum)  # forecast `forecastnum` steps ahead
Exemple #8
0
def check_steady(data):
    """Print the ADF test result for ``data`` and a 5% stationarity verdict."""
    # Stationarity test helper
    from statsmodels.tsa.stattools import adfuller as ADF
    print(u'原始序列的ADF检验结果为:')
    # Run the test once and reuse it; the original recomputed it three times.
    adf_result = ADF(data)
    print('返回所有的信息:', adf_result)
    # Return values in order: adf, pvalue, usedlag, nobs, critical values,
    # icbest, regresults, resstore
    p_value = adf_result[1]
    print('返回P_VALUE:', p_value)
    print('---->>>>将计算获得的p-value与显著性水平数值0.05比较,大于该数值说明该序列不是平稳序列,反之是平稳序列!')
    if p_value < 0.05:
        print('----->>>>>STEADY!')
    else:
        print('----->>>>>NOT STEADY!')
def _needs_more_differencing(frame):
    """Return True if ``frame`` is non-stationary or white noise at the 5% level."""
    return bool(ADF(frame.tmid)[1] >= 0.05
                or acorr_ljungbox(frame, lags=1)[1] >= 0.05)


def cal_d(df):
    """Return the differencing order (0, 1 or 2) to use for ``df``.

    A series is accepted when it is neither non-stationary (ADF p >= 0.05 on
    ``df.tmid``) nor white noise (lag-1 Ljung-Box p >= 0.05).  The original
    data is tried first, then its first difference; if neither is accepted
    the order is capped at 2.
    """
    # The original d=0 check accepted the data when EITHER test passed, which
    # contradicted the differencing loop's rule that BOTH must pass.
    if not _needs_more_differencing(df):
        return 0
    d_df = df.diff(periods=1, axis=0).dropna()
    # Stationarity and white-noise tests on the first difference.
    if not _needs_more_differencing(d_df):
        return 1
    # The order is capped at 2; the second difference itself is not tested.
    return 2
Exemple #10
0
def arima():
    """Run the ARIMA walkthrough on 49 random integers in [1, 20].

    Plots the series, prints ADF results for the raw and lag-3 differenced
    data, picks (p, q) by minimum BIC over ARIMA(p, 1, q) fits, then fits
    that model and requests a 5-step forecast.
    """
    # `random` was used without being imported in this block; import it
    # locally together with the other dependencies.
    import random

    import matplotlib.pyplot as plt
    import pandas as pd

    test_data = [random.randint(1, 20) for _ in range(49)]

    # Time-series plot
    data = pd.Series(test_data)
    data.plot()
    plt.show()

    # Stationarity test
    from statsmodels.tsa.stattools import adfuller as ADF
    print('original ADF result is', ADF(data))

    D_data = data.diff(3).dropna()
    D_data.plot()
    plt.show()
    print('Diffenciate ADF result is', ADF(D_data))

    from statsmodels.tsa.arima_model import ARIMA

    # Order selection
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            # Some (p, q) combinations fail to fit; record None for those.
            # Exception (not a bare except) so Ctrl-C still interrupts.
            try:
                tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
            except Exception:
                tmp.append(None)
        bic_matrix.append(tmp)

    print(bic_matrix)
    # Flatten, then locate the minimum BIC.
    bic_matrix = pd.DataFrame(bic_matrix)
    p, q = bic_matrix.stack().idxmin()
    print('BIC minimum p and q is', p, q)
    model = ARIMA(data, (p, 1, q)).fit()
    model.summary2()
    model.forecast(5)
def stationarity_test():
    """Print how many first differences make the selected column stationary.

    Reads ``./data/discdata_processed.xls``, leaves out the last five rows,
    and differences the column selected below until the ADF p-value is
    below 0.05.
    """
    discfile = './data/discdata_processed.xls'

    data = pd.read_excel(discfile)
    data = data.iloc[:len(data) - 5]

    from statsmodels.tsa.stattools import adfuller as ADF
    diff = 0
    series = data["CWXT_DB:184:D:\\"]
    adf = ADF(series)
    while adf[1] >= 0.05:
        diff += 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)

    print("原始序列经过{}阶差分后归于平稳,对应的p值为{}".format(diff, adf[1]))
Exemple #12
0
def session_2():
    """Print how many first differences make the selected column stationary.

    Reads ``discdata_processed.csv``, leaves out the last ``predict_num``
    rows, and differences the column selected below until the ADF p-value is
    at most 0.05.
    """
    data = pd.read_csv('discdata_processed.csv')

    # The last 5 rows are left out of the test.
    predict_num = 5
    data = data.iloc[:len(data) - predict_num]

    # Stationarity test
    diff = 0
    series = data['CWXT_DB:184:D:\\']
    adf = ADF(series)
    while adf[1] > 0.05:  # adf[1] is the p-value; at most 0.05 is treated as stationary
        diff = diff + 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)

    print('原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
Exemple #13
0
def programmer_2():
    """Print how many first differences make the selected column stationary.

    Reads ``data/discdata_processed.xls``, leaves out the last ``predictnum``
    rows, and differences the column selected below until the ADF p-value is
    at most 0.05.
    """
    discfile = "data/discdata_processed.xls"
    data = pd.read_excel(discfile)
    # The last 5 rows are left out of the test.
    predictnum = 5
    data = data.iloc[:len(data) - predictnum]

    # Stationarity test
    diff = 0
    series = data["CWXT_DB:184:D:\\"]
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)

    print(u"原始序列经过%s阶差分后归于平稳,p值为%s" % (diff, adf[1]))
def adf_test(ts):
    """Return the ADF test of ``ts`` (``autolag='AIC'``) as a labelled Series.

    The first four result fields are labelled, followed by one
    ``Critical Value (<level>)`` entry per critical value.
    """
    result = ADF(ts, autolag='AIC')
    labels = ['Test Statistic','p-value','Lags Used','Number of Observations Used']
    summary = pd.Series(result[:4], index=labels)

    critical_values = result[4]
    for level in critical_values:
        summary['Critical Value (%s)' % level] = critical_values[level]
    return summary
Exemple #15
0
def testStationarity(ts):
    """Print the ADF test of ``ts`` as a labelled pandas Series."""
    result = ADF(ts)
    # Attach readable labels to the first four fields of the result.
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    report = pd.Series(result[:4], index=labels)
    critical_values = result[4]
    for level in critical_values:
        report['Critical Value (%s)' % level] = critical_values[level]
    print(report)
Exemple #16
0
def evaluate(arima_inflow_data, arima_inflow_ground_truth, arima_outflow_data,
             arima_outflow_ground_truth):
    """Fit ARIMA(0, 1, 0) per entry and score the one-step forecasts.

    For each index up to ``len(arima_outflow_ground_truth)`` the inflow and
    outflow observations are ADF-tested (raw and first-differenced, results
    printed), fitted, and forecast one step ahead.

    Returns:
        Tuple of the root mean squared error and the mean absolute error
        over all inflow and outflow forecast errors.
    """
    errors = []

    for idx in range(len(arima_outflow_ground_truth)):
        # Observations and ground truth for entry idx.
        inflow_series = pd.Series(arima_inflow_data[idx])
        inflow_truth = arima_inflow_ground_truth[idx]

        outflow_series = pd.Series(arima_outflow_data[idx])
        outflow_truth = arima_outflow_ground_truth[idx]

        # ADF diagnostics on the raw and first-differenced series.
        print('original in ADF result is', ADF(inflow_series, 1))
        inflow_differenced = inflow_series.diff(1).dropna()
        print('Diffenciate in ADF result is', ADF(inflow_differenced, 1))

        print('original out ADF result is', ADF(outflow_series, 1))
        outflow_differenced = outflow_series.diff(1).dropna()
        print('Diffenciate out ADF result is', ADF(outflow_differenced, 1))

        # Fixed model order.  Figures noted by the original author,
        # presumably scores per (p, q) pair:
        #   0 0 0.01600 / 1 0 0.023 / 1 1 0.026 / 0 1 0.01964
        p, q = 0, 0

        inflow_fit = ARIMA(inflow_series.values, (p, 1, q)).fit()
        inflow_forecast, _stderr, _conf = inflow_fit.forecast(1)
        inflow_error = inflow_forecast - inflow_truth

        outflow_fit = ARIMA(outflow_series.values, (p, 1, q)).fit()
        outflow_forecast, _stderr, _conf = outflow_fit.forecast(1)
        outflow_error = outflow_forecast - outflow_truth

        errors.append(inflow_error)
        errors.append(outflow_error)

    squared = np.square(np.array(errors))
    return np.mean(squared)**0.5, np.mean(np.abs(errors))
def testing(data):
    """Print the ADF result and the lag-1 Ljung-Box result for ``data['volume']``."""
    volume = data['volume']
    print('原始序列的ADF平衡性检验的结果为:', ADF(volume))
    print('原始序列的白噪声检验的结果为:', acorr_ljungbox(volume, lags=1))
Exemple #18
0
def caculate_ADF(data):
    """Print the ADF test result for ``data``."""
    # Sample output noted by the original author.  NOTE(review): the labels
    # below are copied as written and look shifted against the values --
    # confirm against the ADF documentation.
    #   adf: -0.0
    #   pvalue: 0.95853208606005602
    #   nobs: 8
    #   critical values: 10
    #   icbest: {'1%': -4.3315729999999997, '5%': -3.2329500000000002, '10%': -2.7486999999999999}
    #   resstore: -414.96637673426136
    adf_result = ADF(data)
    print(u'原始序列的ADF检验结果为:', adf_result)
Exemple #19
0
def stationarityTest():
    '''
    Print how many first differences make the selected column stationary.

    Reads ``data/discdata_processed.xls``, leaves out the last ``predictnum``
    rows, and differences the column selected below until the ADF p-value is
    at most 0.05.
    :return: None
    '''
    discfile = 'data/discdata_processed.xls'
    predictnum = 5

    data = pd.read_excel(discfile)
    data = data.iloc[:len(data) - predictnum]
    # Stationarity test
    from statsmodels.tsa.stattools import adfuller as ADF
    diff = 0
    series = data['CWXT_DB:184:D:\\']
    adf = ADF(series)
    while adf[1] > 0.05:
        diff = diff + 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)

    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
Exemple #20
0
def stationarityTest(data):
    '''
    Find the differencing order at which ``data['Y']`` passes the ADF test.

    The column is first-differenced repeatedly until the ADF p-value is
    below 0.05; the order and final p-value are printed.
    :return: number of first differences applied
    '''

    # Stationarity test
    from statsmodels.tsa.stattools import adfuller as ADF
    k = 0
    xdata = data['Y']
    adf = ADF(xdata)
    while adf[1] >= 0.05:
        k = k + 1
        # Difference the previous result again; Series.diff(k) would be a
        # lag-k difference, not the k-th order difference reported below.
        xdata = xdata.diff().dropna()
        adf = ADF(xdata)

    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (k, adf[1]))

    return k
def Model_Determination(data):
    '''
    Determine the ARIMA(p, k, q) orders for the ``Open`` column of ``data``.

    ``Open`` is first-differenced until both the ADF p-value and the lag-1
    Ljung-Box p-value are below 0.05; k is the number of differences.  p and
    q come from ``order_determination(data, D_data, k)``, where ``D_data``
    is ``data`` itself when k == 0 and the differenced ``Open`` series
    otherwise.

    :return: (p, k, q)
    '''
    p, k, q = 0, 0, 0  # original series
    D_data = data
    open_series = data['Open']
    ADF_p = ADF(open_series)[1]
    acorr_ljungbox_p = list(acorr_ljungbox(open_series, lags=1)[1])[0]

    # Keep differencing until the series is stationary and not white noise.
    while (ADF_p >= 0.05) or (acorr_ljungbox_p >= 0.05):
        k += 1
        # Difference the previous result again; diff(periods=k) on the
        # original would be a lag-k difference, not the k-th order difference.
        open_series = open_series.diff().dropna()
        D_data = open_series
        ADF_p = ADF(D_data)[1]
        acorr_ljungbox_p = list(acorr_ljungbox(D_data, lags=1)[1])[0]

    p, q = order_determination(data, D_data, k)
    return p, k, q
Exemple #22
0
def stability_test(retrun_series):
    """Report whether ``retrun_series`` is stationary at the 5% ADF critical value.

    The verdict and the full ADF result are written through ``output``.
    """
    statitstic = ADF(retrun_series)
    # Index 0 is the test statistic.  The original read index 1 (the
    # p-value), which always exceeds the negative critical value and so
    # always reported a unit root.
    t_s = statitstic[0]
    t_c = statitstic[4]["5%"]

    if t_s > t_c:
        output("平稳性检验:存在单位根,时间序列不平稳")
    else:
        output("平稳性检验:不存在单位根,时间序列平稳")

    output(f"ADF检验结果:{statitstic}\n")
Exemple #23
0
def diff(timeseries):
    """Print ADF results for a series and its first two differences, then plot them.

    The ADF tests use the ``value`` column; NaNs in the differenced frames
    are replaced with 0 first.
    """
    raw_first = timeseries.diff(1)
    first = raw_first.fillna(0)
    second = raw_first.diff(1).fillna(0)

    # All three tests run before anything is printed.
    adf_original, adf_first, adf_second = [
        ADF(frame['value'].tolist()) for frame in (timeseries, first, second)
    ]

    print('timeseries_adf : ', adf_original)
    print('timeseries_diff1_adf : ', adf_first)
    print('timeseries_diff2_adf : ', adf_second)

    plt.figure(figsize=(16, 12))
    for frame, label, color in ((timeseries, 'Original', 'blue'),
                                (first, 'Diff1', 'red'),
                                (second, 'Diff2', 'purple')):
        plt.plot(frame, label=label, color=color)
    plt.legend(loc='best')
    plt.show()
def random_series():
    """Plot 100 uniform random samples and show their ADF, Ljung-Box, ACF and PACF."""
    samples = np.random.rand(100)
    plt.plot(samples)
    plt.show()

    adf_result = ADF(samples)
    print('ADF平衡性检验的结果为:', adf_result)
    ljungbox_result = acorr_ljungbox(samples, lags=1)
    print('白噪声检验的结果为:', ljungbox_result)

    # Autocorrelation, then partial autocorrelation.
    acf_figure = plot_acf(samples)
    acf_figure.show()
    plt.show()
    pacf_figure = plot_pacf(samples)
    pacf_figure.show()
    plt.show()
def stability_test(close_price):
    """Report whether ``close_price`` is stationary at the 10% ADF critical value.

    The verdict and the full ADF result are written through ``output``.
    """
    statitstic = ADF(close_price)
    # Index 0 is the test statistic.  The original read index 1 (the
    # p-value), which always exceeds the negative critical value and so
    # always reported a unit root.
    t_s = statitstic[0]
    t_c = statitstic[4]["10%"]

    if t_s > t_c:
        output("第三步:平稳性检验:存在单位根,时间序列不平稳")
    else:
        output("第三步:平稳性检验:不存在单位根,时间序列平稳")

    output(f"ADF检验结果:{statitstic}\n")
Exemple #26
0
def AdfTest(index_list):
    """Return 1 if ``index_list`` is judged stationary by the ADF test, else 0.

    The series is judged stationary when at most one critical value lies
    below the test statistic and the p-value is below 0.01.
    NOTE(review): the original comment mentioned a 0.05 p-value threshold,
    but the code has always compared against 0.01 -- confirm the intent.
    """
    # Result fields in order: adf, pvalue, usedlag, nobs, critical values,
    # icbest, regresults, resstore
    result = ADF(index_list)
    statistic = result[0]
    # Count the critical values that sit below the test statistic.
    below_statistic = sum(1 for critical in result[4].values()
                          if critical < statistic)
    return 1 if below_statistic <= 1 and result[1] < 0.01 else 0
def stationarityTest():
    """
    Print how many first differences make the selected column stationary.

    A stationarity check guards against spurious regression caused by a
    stochastic or deterministic trend in the raw series.  This function
    uses the unit-root (ADF) test: the spreadsheet is loaded, the last
    ``predictnum`` rows are left out, and the column selected below is
    differenced until the ADF p-value is at most 0.05.
    :return: None
    """
    # Parameter initialisation
    discfile = "G:\\# Project\\数据集\\UsingDataSet\\Python数据分析与挖掘\\discdata_processed.xls"
    data = pd.read_excel(discfile)
    # The last 5 rows are left out of the test.
    predictnum = 5
    data = data.iloc[:len(data) - predictnum]

    # Stationarity test
    diff = 0
    series = data["CWXT_DB:184:D:\\"]
    adf = ADF(series)
    while adf[1] > 0.05:  # adf[1] is the p-value; at most 0.05 is treated as stationary
        diff = diff + 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)

    print(u"原始序列经过%s阶差分后归于平稳,p值为%s" % (diff, adf[1]))
Exemple #28
0
def stationarityTest():
    '''
    Print how many first differences make the selected column stationary.

    A stationarity check guards against spurious regression caused by a
    stochastic or deterministic trend in the raw series.  The ADF test is
    used; a p-value of at most 0.05 is treated as stationary.  The CSV is
    loaded, the last ``predictnum`` rows are left out, and the column
    selected below is differenced until it passes.
    :return: None
    '''
    discfile = 'data/discdata_processed.csv'
    predictnum = 5
    data = pd.read_csv(discfile)
    data = data.iloc[: len(data) - predictnum]
    # Stationarity test
    from statsmodels.tsa.stattools import adfuller as ADF  # unit-root test
    diff = 0
    series = data['CWXT_DB:184:D:\\']
    adf = ADF(series)
    # A p-value of at most 0.05 is treated as stationary.
    while adf[1] > 0.05:
        diff = diff + 1
        # Difference the previous result again; Series.diff(diff) would be a
        # lag-`diff` difference, not the diff-th order difference reported.
        series = series.diff().dropna()
        adf = ADF(series)
    print(u'原始序列经过%s阶差分后归于平稳,p值为%s' % (diff, adf[1]))
def test_parameters(sel_frame, target, params):
    """Print diagnostics and the minimum-BIC (p, q) for an ARIMA(p, d, q) model.

    ``sel_frame`` is re-indexed by its ``createtime`` column.  The ADF and
    lag-1 Ljung-Box results for column ``target`` are printed, then
    ARIMA(p, params[1], q) is fitted on the re-indexed frame for every p and
    q up to ``int(len / 100)`` and the pair with the smallest BIC is printed.
    """
    sel_frame = sel_frame.set_index(['createtime'])
    test_data = sel_frame[target]

    # Stationarity test
    from statsmodels.tsa.stattools import adfuller as ADF
    print(u'原始序列的ADF检验结果为(第一个返回值为adf,若小于1%5%10%均值则为平稳序列,d=0):', ADF(test_data))

    # White-noise test (statistic and p-value)
    from statsmodels.stats.diagnostic import acorr_ljungbox
    print(u'差分序列的白噪声检验结果为(p值):', acorr_ljungbox(test_data, lags=1))

    # Search bound for p and q.  (The original comment cited the usual
    # length / 10 rule of thumb; the code divides by 100.)
    max_order = int(len(test_data) / 100)

    def bic_or_none(ar_order, ma_order):
        # Some fits fail; print the error and record None for that cell.
        try:
            return ARIMA(sel_frame, (ar_order, params[1], ma_order)).fit().bic
        except Exception as err:
            print(err)
            return None

    # BIC matrix: rows are p, columns are q.
    bic_rows = [
        [bic_or_none(ar_order, ma_order) for ma_order in range(max_order + 1)]
        for ar_order in range(max_order + 1)
    ]
    bic_matrix = pd.DataFrame(bic_rows)

    # Flatten with stack(), then locate the minimum.
    p, q = bic_matrix.stack().idxmin()

    print(u'BIC最小的p值和q值为:%s、%s' % (p, q))

    plt.show()
Exemple #30
0
def checkADF_d(y_ori, diffbegin, diffend):
    """Search for a differencing that is stationary and not white noise.

    Args:
        y_ori: pandas data exposing a ``VVALUE`` column via ``DataFrame(y_ori)``.
        diffbegin: first differencing value to try (inclusive).
        diffend: last differencing value to try (exclusive).

    Returns:
        Tuple ``(d, y_check)``.  ``(0, y_ori)`` when the original passes the
        ADF test at 5%.  Otherwise the value and differenced data of the
        last candidate that passed the ADF test; the search stops at the
        first one that also passes the lag-1 Ljung-Box test.  If no
        candidate passes the ADF test, ``(0, y_ori)`` is returned.
    """
    # Fallback result.  Previously `d`/`y_check` could be unbound or refer to
    # different candidates when the search failed.
    d = 0
    y_check = y_ori

    adf_result = ADF(DataFrame(y_ori)[u'VVALUE'])
    pvalue = adf_result[1]
    if pvalue < 0.05:
        print('%s阶差分,pvalue:%s' % (0, pvalue))
        print(u'差分序列的ADF检验结果为', adf_result)
        return d, y_check

    for i in range(diffbegin, diffend):
        # Caller-chosen range; the original author suggests 1..9.
        # NOTE(review): diff(i) is a lag-i difference, not an i-th order
        # difference -- confirm which one is intended.
        y_dif = y_ori.diff(i).dropna()
        # Take the values before renaming the column; the original renamed
        # first and then looked up 'VVALUE', which no longer existed.
        dif_values = DataFrame(y_dif)[u'VVALUE']
        y_dif.columns = [u'VVALUE_dif']
        adf_result = ADF(dif_values)
        pvalue = adf_result[1]
        if not pvalue < 0.05:
            print('%s阶差分不能满足要求,结束' % i)
            continue

        # p < 0.05: the differenced series is treated as stationary.
        print('%s阶差分,pvalue:%s' % (i, pvalue))
        print(u'差分序列的ADF检验结果为', adf_result)
        d, y_check = i, y_dif
        # Plot the candidate series (the original referenced an undefined `y`).
        y_check.plot()
        plt.show()
        # Ljung-Box statistic and p-value for the candidate.
        ljungbox_result = acorr_ljungbox(dif_values, lags=1)
        if float(ljungbox_result[1]) < 0.05:
            print(u'原序列的白噪声检验结果通过为:', ljungbox_result)
            break
        # p >= 0.05: cannot reject white noise, so this candidate carries no
        # usable structure; try the next one.
        print(u'原序列的白噪声检验结果:当前序列无法拒绝假设,失败为:',
              ljungbox_result)
        print(u'该差分下模型没有意义', ljungbox_result)
    return d, y_check