def test_teststat(self):
    """Check the KPSS statistic against reference values for 'c' and 'ct'."""
    # (regression, reference statistic) pairs, all evaluated with 3 lags.
    references = (('c', 5.0169), ('ct', 1.1828))
    for regression, reference_stat in references:
        # Record (and thereby silence) any warning kpss emits.
        with warnings.catch_warnings(record=True):
            outcome = kpss(self.x, regression, 3)
            kpss_stat, _pval, _lags, _crits = outcome
        assert_almost_equal(kpss_stat, reference_stat, DECIMAL_3)
def test_pval(self):
    """The reported p-value is 0.01 for both the 'c' and 'ct' regressions."""
    for regression in ('c', 'ct'):
        # Record (and thereby silence) any warning kpss emits.
        with warnings.catch_warnings(record=True):
            outcome = kpss(self.x, regression, 3)
            _stat, pval, _lags, _crits = outcome
        assert_equal(pval, 0.01)
def test_fail_unclear_hypothesis(self):
    """Accept 'c'/'ct' in either case; reject any other regression string."""
    # These spellings must be accepted without raising.
    for accepted in ('c', 'C', 'ct', 'CT'):
        kpss(self.x, accepted)

    # Anything else is an unclear null hypothesis and must fail.
    rejected = "unclear hypothesis"
    assert_raises(ValueError, kpss, self.x, rejected)
def test_store(self):
    """With store=True the last return value exposes nobs and lags."""
    with warnings.catch_warnings(record=True):
        outcome = kpss(self.x, 'c', 3, True)
    _stat, _pval, _crit, result_store = outcome

    # assert attributes, and make sure they're correct
    expected_nobs = len(self.x)
    assert_equal(result_store.nobs, expected_nobs)
    assert_equal(result_store.lags, 3)
def test_fail_unclear_hypothesis(self):
    """Accept 'c'/'ct' in either case; reject any other regression string."""
    # These spellings must be accepted; warnings they trigger are recorded
    # rather than shown.
    with warnings.catch_warnings(record=True):
        for accepted in ('c', 'C', 'ct', 'CT'):
            kpss(self.x, accepted)

    # Anything else is an unclear null hypothesis and must fail.
    rejected = "unclear hypothesis"
    assert_raises(ValueError, kpss, self.x, rejected)
def test_lags(self):
    """Automatic lag selection returns the expected lag count per data set."""
    # Each scenario: (lazy series loader, regression, expected lag count).
    # Loaders are lazy so every data set is loaded just before its kpss call.
    scenarios = (
        # self.x (described as real GDP from the macrodata data set)
        (lambda: self.x, 'c', 9),
        # sunspot activity from the sunspots data set
        (lambda: sunspots.load().data['SUNACTIVITY'], 'c', 7),
        # volumes from the nile data set
        (lambda: nile.load().data['volume'], 'c', 5),
        # log-coinsurance from the randhie data set
        (lambda: randhie.load().data['lncoins'], 'ct', 75),
        # in-vehicle time from the modechoice data set
        (lambda: modechoice.load().data['invt'], 'ct', 18),
    )
    for load_series, regression, expected_lags in scenarios:
        with warnings.catch_warnings(record=True):
            selected = kpss(load_series(), regression, lags='auto')[2]
        assert_equal(selected, expected_lags)
def test_lags(self):
    """Default lag count equals ceil(12 * (nobs / 100) ** (1 / 4))."""
    with warnings.catch_warnings(record=True):
        outcome = kpss(self.x, 'c')
    _stat, _pval, lags, _crits = outcome

    nobs = len(self.x)
    expected_lags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))
    assert_equal(lags, expected_lags)
#Step 1 Generate Arma Process np.random.seed(100) arparams = np.array([0.6, -0.8]) maparams = np.array([.75]) ar = np.r_[1, -arparams] # add zero-lag and negate ma = np.r_[1, maparams] # add zero-lag y = sm.tsa.arma_generate_sample(ar, ma, 100) #model = sm.tsa.ARMA(y, (2, 1)).fit(trend='nc', disp=0) #Descriptive TSA Statistics stools.adfuller(y) stools.kpss(y) #Plot ACF and PACF tplot.plot_acf(y) tplot.plot_pacf(y) #Fir ARMA Model tsmodel = sm.tsa.ARMA(y, (2, 1)).fit(trend='nc', disp=0) residuals = tsmodel.resid stools.q_stat(tsmodel.resid, nobs=len(tsmodel.resid)) fig = plt.figure() qq_ax = fig.add_subplot() sm.qqplot(y, line='s', ax=qq_ax) plt.show()
def test_fail_nonvector_input(self):
    """A 1-d series is accepted; a 2-d array must raise ValueError."""
    with warnings.catch_warnings(record=True):
        kpss(self.x)  # should be fine

    two_dimensional = np.random.rand(20, 2)
    assert_raises(ValueError, kpss, two_dimensional)
def test_unknown_lags(self):
    """An unrecognised string for ``lags`` must raise ValueError."""
    bad_options = {'lags': 'unknown'}
    with pytest.raises(ValueError):
        kpss(self.x, 'c', **bad_options)
def __construct_list_order_difference__(serie):
    """Estimate the stationarity of a time series with ADF and KPSS tests.

    __Description__:
        Two tests are run on ``serie``:
            * the augmented Dickey-Fuller test (ADF), once for each of the
              regressions 'nc', 'c', 'ct' and 'ctt';
            * the Kwiatkowski-Phillips-Schmidt-Shin test (KPSS), once for
              each of the regressions 'c' and 'ct'.
        The ADF statistic is compared with the '1%' critical value and the
        KPSS statistic with the '10%' critical value; both p-values are
        compared with an alpha of 1e-4. Edit the local constants to change
        these thresholds.

        For ADF, the null hypothesis H0 is that the series has a unit root;
        the alternative of interest is that the series is stationary (or
        stationary around a trend).
        For KPSS, H0 is that the series is (trend-)stationary and the
        alternative is that it has a unit root.

        A test counts as "stationary" when at least one of its regressions
        passes the statistic check and at least one passes the p-value
        check.

    __Return__:
        lst_d : [list]
            [0]  when both tests indicate stationarity,
            [-1] when neither does,
            [1]  when exactly one of them does.

    __Remarks__:
        Background on combining the two tests:
        https://www.statsmodels.org/stable/examples/notebooks/generated/
        stationarity_detrending_adf_kpss.html?highlight=stationarity

        Case 1: Both tests conclude that the series is not stationary
                -> The series is not stationary
        Case 2: Both tests conclude that the series is stationary
                -> The series is stationary
        Case 3: KPSS indicates stationarity and ADF indicates
                non-stationarity -> The series is trend stationary.
        Case 4: KPSS indicates non-stationarity and ADF indicates
                stationarity -> The series is difference stationary.

        Every raw test result is printed as it is computed.
    """
    adf_threshold = '1%'
    kpss_threshold = '10%'
    adf_alpha = 1e-4
    # /!\ KPSS p-values are only reported within the interval [0.01, 0.1]
    kpss_alpha = 1e-4

    # Run every ADF variant, keeping (statistic, p-value, critical values).
    adf_runs = []
    for trend in ('nc', 'c', 'ct', 'ctt'):
        outcome = adfuller(serie, regression=trend)
        print("ADF__:", outcome)
        adf_runs.append((outcome[0], outcome[1], outcome[4]))

    # Run every KPSS variant, keeping (statistic, p-value, critical values).
    kpss_runs = []
    for trend in ('c', 'ct'):
        outcome = kpss(serie, regression=trend, nlags='auto')
        print("KPSS__:", outcome)
        kpss_runs.append((outcome[0], outcome[1], outcome[3]))

    # ADF rejects the unit root when the statistic is below the critical
    # value and the p-value is below alpha (for at least one regression each).
    adf_stationary = (
        any(stat < crit[adf_threshold] for stat, _, crit in adf_runs)
        and any(p_value < adf_alpha for _, p_value, _ in adf_runs)
    )
    # KPSS keeps its stationarity null when the statistic is below the
    # critical value and the p-value is above alpha.
    kpss_stationary = (
        any(stat < crit[kpss_threshold] for stat, _, crit in kpss_runs)
        and any(p_value > kpss_alpha for _, p_value, _ in kpss_runs)
    )

    if adf_stationary and kpss_stationary:
        print("[ADF] + [KPSS]: TS est stationnaire.")
        return ([0])
    if not adf_stationary and not kpss_stationary:
        print("[ADF] + [KPSS]: TS n'est pas stationnaire.")
        return ([-1])
    if adf_stationary:
        # Only ADF indicates stationarity.
        print(
            "[ADF]: TS stationnaire\n[KPSS] non stationnaire\n --> Stationnaire apres differenciation"
        )
        return ([1])
    # Only KPSS indicates stationarity.
    print(
        "[KPSS]: TS stationnaire\n[ADF]: TS non stationnaire\n --> Staionnaire avec tendance."
    )
    return ([1])
def test_teststat(self):
    """Check the KPSS statistic against reference values for 'c' and 'ct'."""
    # (regression, reference statistic) pairs, all evaluated with 3 lags.
    references = (('c', 5.0169), ('ct', 1.1828))
    for regression, reference_stat in references:
        outcome = kpss(self.x, regression, 3)
        kpss_stat, _pval, _lags, _crits = outcome
        assert_almost_equal(kpss_stat, reference_stat, DECIMAL_3)
def test_unknown_lags(self):
    """An unrecognised string for ``nlags`` must raise ValueError."""
    bad_options = {'nlags': 'unknown'}
    with pytest.raises(ValueError):
        kpss(self.x, 'c', **bad_options)
def test_fail_nonvector_input(self):
    """A 1-d series is accepted; a 2-d array must raise ValueError."""
    kpss(self.x)  # should be fine

    two_dimensional = np.random.rand(20, 2)
    assert_raises(ValueError, kpss, two_dimensional)
def test_legacy_lags(self):
    """The 'legacy' lag rule yields 15 lags for the test series."""
    with warnings.catch_warnings(record=True):
        outcome = kpss(self.x, 'c', lags='legacy')
        selected_lags = outcome[2]
    assert_equal(selected_lags, 15)
# Split s1 into two halves and compare their means and variances as a quick
# informal stationarity check.
split = round(s1.shape[0]/2)
X1 = s1[0:int(split)]
X2 = s1[int(split):]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))

# to check the stationarity
# ADF test on the 'replace' column of s2, lag chosen by AIC.
result = adfuller(s2['replace'], autolag="AIC")
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

# KPSS test ('c' regression) on p1.Sales; the first three return values are
# labelled as statistic, p-value and lags.
# NOTE(review): ADF is run on s2['replace'] but KPSS on p1.Sales -- confirm
# both are meant to describe the same series.
kpsstest = kpss(p1.Sales, regression='c')
kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic', 'p-value', 'Lags Used'])
kpss_output  # bare expression: only displays in a notebook/REPL

# This is used to find the value of d
# First-order differencing of s1 (the first element becomes NaN).
s1 = s1-s1.shift(1)

#To check the stationarity
# Repeat both tests on the differenced series, dropping the leading NaN.
result = adfuller(s1.dropna(), autolag='AIC')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
kpsstest = kpss(s1.dropna(), regression='c')
kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic', 'p-value', 'Lags Used'])
kpss_output  # bare expression: only displays in a notebook/REPL
def unitroot(
    other_args: List[str],
    residuals: List[float],
):
    """Print ADF and KPSS unit root / stationarity test results.

    Parameters
    ----------
    other_args : List[str]
        Command line arguments to be processed with argparse
    residuals : List[float]
        Residuals data

    Notes
    -----
    Nothing is returned. Any exception raised while parsing or testing is
    printed instead of propagated.
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="arch",
        description="""
            Unit root test / stationarity (ADF, KPSS)
        """,
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        # Augmented Dickey-Fuller test: tests for a unit root in a univariate
        # process in the presence of serial correlation.
        # regression is one of 'c', 'ct', 'ctt', 'nc'; 'c' includes a constant
        # only ('ct' would test stationarity around a trend).
        adf_result = adfuller(residuals, regression="c")
        adf_stat, adf_pvalue, adf_lags, adf_nobs, adf_critical = adf_result[:5]
        print("Augmented Dickey Fuller Test")
        print("ADF Statistic: %.4f" % adf_stat)
        print("p-value: %.4f" % adf_pvalue)
        print("Used lags: %d" % adf_lags)
        print("Num obs: %d" % adf_nobs)
        print("Critical Values:")
        # Listed in ascending order of the critical value.
        for level, critical in sorted(adf_critical.items(), key=lambda t: t[1]):
            print(f"\t{level}: {critical:.3f}")
        print("")

        # Kwiatkowski-Phillips-Schmidt-Shin test: tests for level or trend
        # stationarity.
        #   regression 'c'  : the data is stationary around a constant.
        #   regression 'ct' : the data is stationary around a trend.
        # see: https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.kpss.html
        print("Kwiatkowski-Phillips-Schmidt-Shin Test")
        kpss_result = kpss(residuals, regression="c", nlags="auto")
        print("KPSS Statistic: %.4f" % kpss_result[0])
        print("Critical Values:")
        # Listed in descending order of the critical value.
        kpss_critical = sorted(
            kpss_result[3].items(), key=lambda t: t[1], reverse=True)
        for level, critical in kpss_critical:
            print(f"\t{level}: {critical:.3f}")
        print("")

    except Exception as e:
        print(e, "\n")
        return
) st.write(f"ADF Statistic: {result[0]}") st.write(f"p-value: {result[1]}") st.write(f"n-lags: {result[2]}") st.write(f"observations: {result[3]}") st.write("Critical Values:") for key, value in result[4].items(): st.write(f"{key}: {value}") if result[1] <= 0.05: st.info("Hypothesis Rejected") else: st.warning("Test Inconclusive") with test_2: with st.beta_expander("KPSS Test (Stationary Test)"): regression = st.radio("regression", ["c", "ct"]) result = kpss(df.values, regression=regression) st.write( "<p style='color:green;'>H<sub>0</sub>: Stationary</p>", unsafe_allow_html=True, ) st.write(f"KPSS Statistic: {result[0]}") st.write(f"p-value: {result[1]}") st.write(f"n-lags: {result[2]}") st.write("Critical Values:") for key, value in result[3].items(): st.write(f"{key}: {value}") if result[1] <= 0.05: st.info("Hypothesis Rejected") else: st.warning("Test Inconclusive")
# Read one integer per line from the input file, then write summary metrics
# for that series to the already-open `file` handle.
# NOTE(review): `input` shadows the builtin of the same name.
with open(path + "/" + folder + "/" + filename, "r") as input:
    array = []
    for line in input:
        array.append(int(line))
    file.write(
        "minimum: " + str(min(array)) + "\n"
    )  # compute all the metrics: min,max,mean,variance,std.dev,kpss,bds,hurst exponent
    file.write("maximum: " + str(max(array)) + "\n")
    file.write("mean: " + str(numpy.mean(array)) + "\n")
    file.write("variance: " + str(numpy.var(array)) + "\n")
    file.write("standard deviation: " + str(statistics.stdev(array)) + "\n")
    # KPSS stationarity test; only the test statistic is written out.
    kpss_stat, p_value, lags, crit = stat.kpss(
        array
    )  # https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.kpss.html
    file.write("KPSS: " + str(kpss_stat) + "\n")
    # Track the largest BDS statistic over every 200-element window,
    # advancing the window one element at a time. max_bds starts at 0, so
    # it stays 0 when no window has a positive statistic or when the series
    # has fewer than 200 elements.
    start = 0
    end = 200
    max_bds = 0
    while (end <= len(array)
           ):  # sliding window with fixed size (200 elements)
        bds_stat, pvalue = stat.bds(
            array[start:end]
        )  # https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.bds.html
        if (bds_stat > max_bds):
            max_bds = bds_stat
        start += 1
        end += 1
def test_pval(self):
    """The reported p-value is 0.01 for both the 'c' and 'ct' regressions."""
    for regression in ('c', 'ct'):
        outcome = kpss(self.x, regression, 3)
        _stat, pval, _lags, _crits = outcome
        assert_equal(pval, 0.01)
def test_legacy_lags(self):
    """The 'legacy' lag rule yields 15 lags and warns about interpolation."""
    # kpss is expected to emit an InterpolationWarning for this series.
    with pytest.warns(InterpolationWarning):
        outcome = kpss(self.x, 'c', nlags='legacy')
    selected_lags = outcome[2]
    assert_equal(selected_lags, 15)
def test_store(self):
    """With store=True the last return value exposes nobs and lags."""
    outcome = kpss(self.x, 'c', 3, True)
    _stat, _pval, _crit, result_store = outcome

    # assert attributes, and make sure they're correct
    expected_nobs = len(self.x)
    assert_equal(result_store.nobs, expected_nobs)
    assert_equal(result_store.lags, 3)
def test_deprecation(self):
    """Calling kpss without an explicit lag argument emits a FutureWarning."""
    expected_warning = FutureWarning
    with pytest.warns(expected_warning):
        kpss(self.x, 'c')
def test_lags(self):
    """Default lag count equals ceil(12 * (nobs / 100) ** (1 / 4))."""
    outcome = kpss(self.x, 'c')
    _stat, _pval, lags, _crits = outcome

    nobs = len(self.x)
    expected_lags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))
    assert_equal(lags, expected_lags)
# xTrain = np.c_[xTrain, condData] # condData = nTrain['conductivity'].values[y.size/2-50: y.size-50] # xTest = np.c_[xTest, condData] # y1 = sklearn.preprocessing.normalize([nTrain['nitrateMg'].values], norm='l1').ravel() # y2 = sklearn.preprocessing.normalize([ nTrain['conductivity'].values], norm='l1').ravel() # plt.plot(nTrain['index'].values, y1, label='nitrate') # plt.plot(nTrain['index'].values,y2, label='condictivity') # plt.plot(nTrain['index'].values, y2-y1, label='residuals') # plt.legend() # print(y1.ravel()) # print(adfuller(y2-y1)) a, conductivity = remove_missing_values(index, conductivity) print(conductivity[0:10]) print(adfuller(nitrateMg)) print(kpss(nitrateMg)) # RANSAC_regresssion(xTrain, xTest, X1, yTrain, X2, yTest, "with conductivity") # Evaluating whether each method makes sense # if no relationship, no include # if high corrolation then include # if dtw distance is lower than corrolation then include some history # dwtD = get_dtw_d(nTrain['nitrateMg'], nTrain['conductivity']) # pcc = pearsonr(nTrain['nitrateMg'], nTrain['conductivity']) # print("Conductivity and nitrate") # print("DWT distance normalised: " + str(dwtD/2)) # print("Preason cc: " + str(pcc[0])) # print("-----") # dwtD = get_dtw_d(nTrain['nitrateMg'], nTrain['n'])
from statsmodels.tsa.stattools import adfuller, kpss

# df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/a10.csv', parse_dates=['date'])
# print('df_input_values:',df_input.values)

# ADF test: run once per classification column of df_input and print the
# statistic, p-value and critical values.
for classification in ['PA','PB','PC','PD','PE','PF','PG']:
    result = adfuller(df_input[classification], autolag='AIC')
    print(f'ADF Statistic {classification}: {result[0]}')
    print(f'p-value: {result[1]}')
    # The header is printed once per test rather than once per critical
    # value, and its spelling is corrected.
    print('Critical Values:')
    for key, value in result[4].items():
        print(f' {key}, {value}')

# KPSS test ('c' regression): same report for every classification column.
for classification in ['PA','PB','PC','PD','PE','PF','PG']:
    result = kpss(df_input[classification], regression='c')
    print('\nKPSS Statistic:',classification,' %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[3].items():
        print(f' {key}, {value}')

# Estimating and eliminating trend: log-transform each column, print it and
# plot it without blocking the script.
for classification in ['PA','PB','PC','PD','PE','PF','PG']:
    ts_log=np.log(df_input[classification])
    print('datatype of ts_log:',type(ts_log))
    print('ts_log: ',ts_log)
    plt.plot(ts_log)
    plt.show(block=False)
# print(df.head())
def main():
    """Test fiber-density derivatives of single-cell simulations for stationarity.

    For every selected simulation the fiber densities of the four directions
    are averaged, each derivative order in DERIVATIVES is computed, and the
    KPSS and ADF p-values are collected. The share of series that each test
    considers stationary is printed per derivative, and one box plot per
    test is saved as HTML.
    """
    # select the simulations
    _simulations = filtering.by_time_points_amount(load.structured(), _time_points=TIME_POINTS)
    _simulations = filtering.by_categories(
        _simulations,
        _is_single_cell=True,
        _is_heterogeneity=False,
        _is_low_connectivity=False,
        _is_causality=False,
        _is_dominant_passive=False,
        _is_fibrin=False
    )
    print('Total simulations:', len(_simulations))

    _fiber_densities = compute_simulations_fiber_densities(_simulations)

    # one list of p-values per derivative order
    _kpss_y_arrays = [[] for _ in DERIVATIVES]
    _adf_y_arrays = [[] for _ in DERIVATIVES]
    _directions = ['left', 'right', 'up', 'down']
    for _simulation in tqdm(_simulations, desc='Simulations loop'):
        # average the four directions into a single series
        _mean_fiber_densities = np.mean(
            [_fiber_densities[(_simulation, _direction)] for _direction in _directions],
            axis=0
        )
        for _kpss_p_values, _adf_p_values, _derivative in zip(_kpss_y_arrays, _adf_y_arrays, DERIVATIVES):
            _series = compute_lib.derivative(_mean_fiber_densities, _n=_derivative)
            # silence the interpolation warning while testing
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=InterpolationWarning)
                _, _kpss_p_value, _, _ = kpss(_series, nlags='legacy')
                _kpss_p_values.append(_kpss_p_value)
                _, _adf_p_value, _, _, _, _ = adfuller(_series)
                _adf_p_values.append(_adf_p_value)

    print('Total cells:', len(_kpss_y_arrays[0]))

    # print results
    # KPSS: a p-value above 0.05 is counted as stationary
    print('KPSS:')
    for _derivative, _p_values in zip(DERIVATIVES, _kpss_y_arrays):
        _stationary_count = sum(1 for _value in _p_values if _value > 0.05)
        print('Derivative:', _derivative, 'Stationary:',
              str(_stationary_count / len(_p_values) * 100) + '%')
    # ADF: a p-value below 0.05 is counted as stationary
    print('ADF:')
    for _derivative, _p_values in zip(DERIVATIVES, _adf_y_arrays):
        _stationary_count = sum(1 for _value in _p_values if _value < 0.05)
        print('Derivative:', _derivative, 'Stationary:',
              str(_stationary_count / len(_p_values) * 100) + '%')

    # plot
    _colors_array = config.colors(3)
    # (test name, y-axis title, y-axis tick values, p-value line, p-values)
    _plots_settings = [
        ('kpss', 'KPSS test p-value', [0.05, 0.1], 0.05, _kpss_y_arrays),
        ('adf', 'ADF test p-value', [0.05, 1], 0.05, _adf_y_arrays)
    ]
    for _test_name, _y_title, _y_tickvals, _p_value_line, _y_arrays in _plots_settings:
        # one box per derivative order
        _boxes = []
        for _y, _derivative, _color in zip(_y_arrays, DERIVATIVES_TEXT, _colors_array):
            _boxes.append(
                go.Box(
                    y=_y,
                    name=_derivative,
                    boxpoints='all',
                    jitter=1,
                    pointpos=0,
                    line={
                        'width': 1
                    },
                    fillcolor='white',
                    marker={
                        'size': 10,
                        'color': _color
                    },
                    opacity=0.7,
                    showlegend=False
                )
            )

        # dashed red line marking the p-value threshold
        _threshold_line = {
            'type': 'line',
            'x0': DERIVATIVES[0] - 0.75,
            'y0': _p_value_line,
            'x1': DERIVATIVES[-1] + 0.75,
            'y1': _p_value_line,
            'line': {
                'color': 'red',
                'width': 2,
                'dash': 'dash'
            }
        }
        _layout = {
            'xaxis': {
                'title': 'Fiber density derivative',
                'zeroline': False
            },
            'yaxis': {
                'title': _y_title,
                'zeroline': False,
                'tickmode': 'array',
                'tickvals': _y_tickvals
            },
            'shapes': [_threshold_line]
        }
        _fig = go.Figure(data=_boxes, layout=_layout)

        save.to_html(
            _fig=_fig,
            _path=os.path.join(paths.PLOTS, save.get_module_name()),
            _filename='plot_' + _test_name
        )
def test_deprecation(self):
    """Passing ``lags=None`` explicitly triggers a deprecation warning."""
    deprecated_options = {'lags': None}
    with pytest.deprecated_call():
        kpss(self.x, 'c', **deprecated_options)
def main(_band=None, _high_temporal_resolution=True, _tuples_to_mark=None, _tuples_to_plot=None, _plots=None):
    """Run a Granger-causality analysis on the fiber densities of cell pairs.

    For every real cell pair the left/right fiber-density series are
    differentiated until they pass the enabled stationarity tests (ADF and/or
    KPSS), a VAR model is fitted with the smallest lag proposed by the
    AIC/BIC/FPE/HQIC criteria, the residuals are tested for whiteness and,
    when that passes, Granger causality is tested in both directions.
    Summary counts are printed and several plots are saved as HTML.

    :param _band: forwarded to ``filtering.by_band``.
    :param _high_temporal_resolution: forwarded to ``filtering.by_categories``
        as ``_is_high_temporal_resolution``.
    :param _tuples_to_mark: optional collection of
        (experiment, series id, group) tuples whose passing whiteness /
        Granger p-values are printed explicitly.
    :param _tuples_to_plot: optional collection of tuples for which the
        derivative series and the VAR residuals are plotted.
    :param _plots: names of the summary box plots to save; defaults to
        ``['whiteness', 'granger']``.
    """
    if _plots is None:
        _plots = ['whiteness', 'granger']

    # select the experiments and the pairs
    _experiments = all_experiments()
    _experiments = filtering.by_categories(
        _experiments=_experiments,
        _is_single_cell=False,
        _is_high_temporal_resolution=_high_temporal_resolution,
        _is_bleb=False,
        _is_dead_dead=False,
        _is_live_dead=False,
        _is_bead=False,
        _is_metastasis=False)

    _tuples = load.experiments_groups_as_tuples(_experiments)
    _tuples = filtering.by_time_frames_amount(_tuples, _time_frames=MINIMUM_TIME_FRAMES)
    _tuples = filtering.by_pair_distance_range(
        _tuples, _distance_range=PAIR_DISTANCE_RANGE)
    _tuples = filtering.by_real_pairs(_tuples)
    _tuples = filtering.by_band(_tuples, _band=_band)
    print('Total tuples:', len(_tuples))

    # one quantification window request per cell of every pair
    _arguments = []
    for _tuple in _tuples:
        _experiment, _series_id, _group = _tuple
        _latest_time_frame = compute.latest_time_frame_before_overlapping(
            _experiment, _series_id, _group, OFFSET_X)
        for _cell_id in ['left_cell', 'right_cell']:
            _arguments.append({
                'experiment': _experiment,
                'series_id': _series_id,
                'group': _group,
                'length_x': QUANTIFICATION_WINDOW_LENGTH_IN_CELL_DIAMETER,
                'length_y': QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER,
                'length_z': QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER,
                'offset_x': OFFSET_X,
                'offset_y': OFFSET_Y,
                'offset_z': OFFSET_Z,
                'cell_id': _cell_id,
                'direction': 'inside',
                'time_points': _latest_time_frame
            })

    _windows_dictionary, _windows_to_compute = compute.windows(
        _arguments, _keys=['experiment', 'series_id', 'group', 'cell_id'])
    _fiber_densities = compute.fiber_densities(_windows_to_compute, _subtract_border=True)

    _experiments_fiber_densities = {
        _key: [_fiber_densities[_tuple] for _tuple in _windows_dictionary[_key]]
        for _key in _windows_dictionary
    }

    # accumulators
    _n_pairs = 0
    _n_pairs_with_band = 0
    _whiteness_p_values = []
    _n_passed_whiteness_with_band = 0
    _granger_causality_p_values = []
    _n_passed_granger_causality_with_band = 0
    _correlations = []
    _time_lag_correlations = []
    _end_fiber_densities = []
    for _tuple in _tuples:
        _experiment, _series_id, _group = _tuple

        _left_cell_fiber_densities = \
            _experiments_fiber_densities[(_experiment, _series_id, _group, 'left_cell')]
        _right_cell_fiber_densities = \
            _experiments_fiber_densities[(_experiment, _series_id, _group, 'right_cell')]

        _properties = load.group_properties(_experiment, _series_id, _group)
        _left_cell_fiber_densities = compute.remove_blacklist(
            _experiment, _series_id, _properties['cells_ids']['left_cell'],
            _left_cell_fiber_densities)
        _right_cell_fiber_densities = compute.remove_blacklist(
            _experiment, _series_id, _properties['cells_ids']['right_cell'],
            _right_cell_fiber_densities)

        _left_cell_fiber_densities_filtered, _right_cell_fiber_densities_filtered = \
            compute.longest_same_indices_shared_in_borders_sub_array(
                _left_cell_fiber_densities, _right_cell_fiber_densities)

        # ignore small arrays
        if len(_left_cell_fiber_densities_filtered) < MINIMUM_TIME_FRAMES:
            continue

        _n_pairs += 1
        if _properties['band']:
            _n_pairs_with_band += 1

        # index of the first entry whose value equals the first filtered value
        _start_time_frame = 0
        for _left in _left_cell_fiber_densities:
            if _left[0] == _left_cell_fiber_densities_filtered[0]:
                break
            _start_time_frame += 1

        # stationary test
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=InterpolationWarning)

            # find derivative for stationary; when none of the orders passes,
            # the series of the last order tried are used
            for _derivative in range(10):
                _left_cell_fiber_densities_derivative = \
                    compute_lib.derivative(_left_cell_fiber_densities_filtered, _n=_derivative)
                _right_cell_fiber_densities_derivative = \
                    compute_lib.derivative(_right_cell_fiber_densities_filtered, _n=_derivative)

                if ADF_TEST:
                    _, _left_cell_adf_p_value, _, _, _, _ = adfuller(
                        _left_cell_fiber_densities_derivative)
                    _, _right_cell_adf_p_value, _, _, _, _ = adfuller(
                        _right_cell_fiber_densities_derivative)
                    if _left_cell_adf_p_value > 0.05 or _right_cell_adf_p_value > 0.05:
                        continue

                if KPSS_TEST:
                    _, _left_cell_kpss_p_value, _, _ = kpss(
                        _left_cell_fiber_densities_derivative, nlags='legacy')
                    _, _right_cell_kpss_p_value, _, _ = kpss(
                        _right_cell_fiber_densities_derivative, nlags='legacy')
                    if _left_cell_kpss_p_value < 0.05 or _right_cell_kpss_p_value < 0.05:
                        continue

                # stationary
                break

        # causality
        try:
            _x = pd.DataFrame(
                data=[[_left_value, _right_value] for _left_value, _right_value in zip(
                    _left_cell_fiber_densities_derivative,
                    _right_cell_fiber_densities_derivative)],
                columns=['left', 'right'])

            # var model to retrieve lag
            _var_model = VAR(_x)
            _lag_order_results = _var_model.select_order()
            _estimators_lags = [
                _lag_order_results.aic,
                _lag_order_results.bic,
                _lag_order_results.fpe,
                _lag_order_results.hqic
            ]
            _min_estimator_lag = min(_estimators_lags)

            # found a lag
            if 0 < _min_estimator_lag <= MAXIMUM_LAG:
                _var_model_results = _var_model.fit(maxlags=_min_estimator_lag, ic=None)

                _whiteness = _var_model_results.test_whiteness(
                    nlags=_min_estimator_lag + 1)
                _whiteness_p_values.append(_whiteness.pvalue)

                if _tuples_to_mark is not None and _tuple in _tuples_to_mark and _whiteness.pvalue > 0.05:
                    print(_tuple, 'marked whiteness p-value:', _whiteness.pvalue)

                # no autocorrelation in the residuals
                if _whiteness.pvalue > 0.05:
                    if _properties['band']:
                        _n_passed_whiteness_with_band += 1

                    # time lag = 0
                    _correlation = compute_lib.correlation(
                        _left_cell_fiber_densities_derivative,
                        _right_cell_fiber_densities_derivative)

                    # if _correlation < 0.5:
                    #     continue

                    # granger causality
                    for _caused, _causing in zip(['left', 'right'], ['right', 'left']):
                        _granger = _var_model_results.test_causality(
                            caused=_caused, causing=_causing)
                        _granger_causality_p_values.append(_granger.pvalue)

                        # time lag = 0
                        _correlations.append(_correlation)

                        # time lag = min estimator
                        if _causing == 'left':
                            _left_fiber_densities_time_lag = \
                                _left_cell_fiber_densities_derivative[:-_min_estimator_lag]
                            _right_fiber_densities_time_lag = \
                                _right_cell_fiber_densities_derivative[_min_estimator_lag:]
                        else:
                            _left_fiber_densities_time_lag = \
                                _left_cell_fiber_densities_derivative[_min_estimator_lag:]
                            _right_fiber_densities_time_lag = \
                                _right_cell_fiber_densities_derivative[:-_min_estimator_lag]
                        _time_lag_correlation = compute_lib.correlation(
                            _left_fiber_densities_time_lag,
                            _right_fiber_densities_time_lag)
                        _time_lag_correlations.append(_time_lag_correlation)

                        # end fiber density
                        _time_frame = compute.density_time_frame(_experiment)
                        if len(_left_cell_fiber_densities_filtered) > _time_frame:
                            _end_fiber_density = \
                                (_left_cell_fiber_densities_filtered[_time_frame] +
                                 _right_cell_fiber_densities_filtered[_time_frame]) / 2
                        else:
                            _end_fiber_density = \
                                (_left_cell_fiber_densities_filtered[-1] +
                                 _right_cell_fiber_densities_filtered[-1]) / 2
                        _normalization = load.normalization_series_file_data(
                            _experiment, _series_id)
                        _normalized_fiber_density = compute_lib.z_score(
                            _end_fiber_density, _normalization['average'],
                            _normalization['std'])
                        _end_fiber_densities.append(_normalized_fiber_density)

                        # marking
                        if _tuples_to_mark is not None and _tuple in _tuples_to_mark and _granger.pvalue < 0.05:
                            print(_tuple, 'causing:', _causing,
                                  'marked granger p-value:', _granger.pvalue)

                        if _granger.pvalue < 0.05:
                            if _properties['band']:
                                _n_passed_granger_causality_with_band += 1

                            _normality = _var_model_results.test_normality()
                            _inst_granger = _var_model_results.test_inst_causality(
                                causing=_causing)

                            print(
                                _tuple,
                                _causing.capitalize() + ' causes ' + _caused + '!',
                                'time-points: ' + str(len(_left_cell_fiber_densities_derivative)),
                                'stationary derivative: ' + str(_derivative),
                                'band:' + str(_properties['band']),
                                'p-value: ' + str(round(_granger.pvalue, 4)),
                                'lag: ' + str(_min_estimator_lag),
                                'normality p-value: ' + str(round(_normality.pvalue, 4)),
                                'inst p-value: ' + str(round(_inst_granger.pvalue, 4)),
                                sep='\t')

                            # lag = 0
                            print('Time lag = 0 correlation:', _correlation)

                            # rest of lags
                            # The per-lag value is kept in its own variable:
                            # reusing `_correlation` here would overwrite the
                            # lag-0 correlation that the next causality
                            # direction appends to `_correlations`.
                            for _lag in range(1, _min_estimator_lag + 1):
                                if _causing == 'left':
                                    _left_fiber_densities_time_lag = \
                                        _left_cell_fiber_densities_derivative[:-_lag]
                                    _right_fiber_densities_time_lag = \
                                        _right_cell_fiber_densities_derivative[_lag:]
                                else:
                                    _left_fiber_densities_time_lag = \
                                        _left_cell_fiber_densities_derivative[_lag:]
                                    _right_fiber_densities_time_lag = \
                                        _right_cell_fiber_densities_derivative[:-_lag]
                                _lag_correlation = compute_lib.correlation(
                                    _left_fiber_densities_time_lag,
                                    _right_fiber_densities_time_lag)
                                print('Time lag = ' + str(_lag) + ' correlation:',
                                      _lag_correlation)

                    # plots
                    if _tuples_to_plot is not None and _tuple in _tuples_to_plot:
                        _y_arrays = [
                            _left_cell_fiber_densities_derivative,
                            _right_cell_fiber_densities_derivative
                        ]
                        _names_array = ['Left cell', 'Right cell']
                        _colors_array = config.colors(2)
                        _temporal_resolution = compute.temporal_resolution_in_minutes(
                            _experiment)
                        _fig = go.Figure(
                            data=[
                                go.Scatter(
                                    x=np.arange(
                                        start=_start_time_frame,
                                        stop=_start_time_frame + len(_left_cell_fiber_densities_derivative),
                                        step=1) * _temporal_resolution,
                                    y=_y,
                                    name=_name,
                                    mode='lines',
                                    line={
                                        'color': _color,
                                        'width': 1
                                    })
                                for _y, _name, _color in zip(
                                    _y_arrays, _names_array, _colors_array)
                            ],
                            layout={
                                'xaxis': {
                                    'title': 'Time (minutes)',
                                    'zeroline': False
                                },
                                'yaxis': {
                                    # one prime per derivative order
                                    'title': 'Fiber density (z-score)' + '\'' * _derivative,
                                    'zeroline': False
                                },
                                'legend': {
                                    'xanchor': 'left',
                                    'x': 0.1,
                                    'yanchor': 'top',
                                    'bordercolor': 'black',
                                    'borderwidth': 2,
                                    'bgcolor': 'white'
                                },
                            })

                        _experiment, _series_id, _group = _tuple
                        save.to_html(
                            _fig=_fig,
                            _path=os.path.join(paths.PLOTS, save.get_module_name()),
                            _filename='plot_' + _experiment + '_' + str(_series_id) + '_' + _group)

                        # residuals
                        _y_arrays = \
                            [_var_model_results.resid.values[:, 0], _var_model_results.resid.values[:, 1]]
                        _fig = go.Figure(
                            data=[
                                go.Scatter(
                                    x=np.arange(
                                        start=_start_time_frame,
                                        stop=_start_time_frame + len(_y),
                                        step=1) * _temporal_resolution,
                                    y=_y,
                                    name=_name,
                                    mode='lines',
                                    line={
                                        'color': _color,
                                        'width': 1
                                    })
                                for _y, _name, _color in zip(
                                    _y_arrays, _names_array, _colors_array)
                            ],
                            layout={
                                'xaxis': {
                                    'title': 'Time (minutes)',
                                    'zeroline': False
                                },
                                'yaxis': {
                                    'title': 'Residual',
                                    'zeroline': False
                                },
                                'legend': {
                                    'xanchor': 'left',
                                    'x': 0.1,
                                    'yanchor': 'top',
                                    'bordercolor': 'black',
                                    'borderwidth': 2,
                                    'bgcolor': 'white'
                                },
                            })

                        _experiment, _series_id, _group = _tuple
                        save.to_html(
                            _fig=_fig,
                            _path=os.path.join(paths.PLOTS, save.get_module_name()),
                            _filename='plot_residuals_' + _experiment + '_' + str(_series_id) + '_' + _group)

        # not enough time points
        except ValueError:
            continue

    print('Total pairs:', _n_pairs)
    print('Total pairs with band:', _n_pairs_with_band)
    print('Total pairs passed whiteness:',
          (np.array(_whiteness_p_values) > 0.05).sum())
    print('Total pairs passed whiteness with band:',
          _n_passed_whiteness_with_band)
    print('Total cells passed granger causality:',
          (np.array(_granger_causality_p_values) < 0.05).sum())
    print('Total cells passed granger causality with band:',
          _n_passed_granger_causality_with_band)

    # p-value correction
    print('Corrections of GC p-value < 0.05:')
    _granger_causality_p_values_corrected = multipletests(
        pvals=_granger_causality_p_values, method='fdr_bh')
    # index 1 of the multipletests result is used as the corrected p-values
    for _p_value, _p_value_corrected in zip(
            _granger_causality_p_values,
            _granger_causality_p_values_corrected[1]):
        if _p_value < 0.05:
            print('Original GC p-value:', _p_value, 'corrected:',
                  _p_value_corrected)

    # plots
    for _test_name, _y_title, _y_array in \
            zip(
                ['whiteness', 'granger'],
                ['Whiteness p-value', 'Granger causality p-value'],
                [_whiteness_p_values, _granger_causality_p_values]
            ):
        if _test_name in _plots:
            _fig = go.Figure(
                data=go.Box(
                    y=_y_array,
                    boxpoints='all',
                    jitter=1,
                    pointpos=0,
                    line={'width': 1},
                    fillcolor='white',
                    marker={
                        'size': 10,
                        'color': '#ea8500'
                    },
                    opacity=0.7,
                    showlegend=False),
                layout={
                    'xaxis': {
                        'zeroline': False
                    },
                    'yaxis': {
                        'title': _y_title,
                        'zeroline': False,
                        'range': [-0.1, 1.1],
                        'tickmode': 'array',
                        'tickvals': [0.05, 1]
                    },
                    # dashed red line at the 0.05 threshold
                    'shapes': [{
                        'type': 'line',
                        'x0': -0.75,
                        'y0': 0.05,
                        'x1': 0.75,
                        'y1': 0.05,
                        'line': {
                            'color': 'red',
                            'width': 2,
                            'dash': 'dash'
                        }
                    }]
                })

            save.to_html(_fig=_fig,
                         _path=os.path.join(paths.PLOTS, save.get_module_name()),
                         _filename='plot_' + _test_name)

    # granger versus correlation
    print(
        'GC vs. correlation pearson correlation:',
        compute_lib.correlation(_granger_causality_p_values,
                                _correlations,
                                _with_p_value=True))
    _fig = go.Figure(
        data=go.Scatter(
            x=_granger_causality_p_values,
            y=_correlations,
            mode='markers',
            marker={
                'size': 10,
                'color': '#ea8500'
            },
            showlegend=False),
        layout={
            'xaxis': {
                'title': 'Granger causality p-value',
                'zeroline': False,
            },
            'yaxis': {
                'title': 'Inner correlation',
                'zeroline': False,
            }
        })

    save.to_html(_fig=_fig,
                 _path=os.path.join(paths.PLOTS, save.get_module_name()),
                 _filename='plot_gc_vs_correlation')

    # granger versus time lag correlation
    print(
        'GC vs. time lag correlation pearson correlation:',
        compute_lib.correlation(_granger_causality_p_values,
                                _time_lag_correlations,
                                _with_p_value=True))
    _fig = go.Figure(
        data=go.Scatter(
            x=_granger_causality_p_values,
            y=_time_lag_correlations,
            mode='markers',
            marker={
                'size': 10,
                'color': '#ea8500'
            },
            showlegend=False),
        layout={
            'xaxis': {
                'title': 'Granger causality p-value',
                'zeroline': False,
            },
            'yaxis': {
                'title': 'GC lag inner correlation',
                'zeroline': False,
            }
        })

    save.to_html(_fig=_fig,
                 _path=os.path.join(paths.PLOTS, save.get_module_name()),
                 _filename='plot_gc_vs_time_lag_correlation')

    # granger versus end fiber density
    print(
        'GC vs. end fiber density pearson correlation:',
        compute_lib.correlation(_granger_causality_p_values,
                                _end_fiber_densities,
                                _with_p_value=True))
    _fig = go.Figure(
        data=go.Scatter(
            x=_granger_causality_p_values,
            y=_end_fiber_densities,
            mode='markers',
            marker={
                'size': 10,
                'color': '#ea8500'
            },
            showlegend=False),
        layout={
            'xaxis': {
                'title': 'Granger causality p-value',
                'zeroline': False,
            },
            'yaxis': {
                'title': 'End fiber density (z-score)',
                'zeroline': False,
            }
        })

    save.to_html(_fig=_fig,
                 _path=os.path.join(paths.PLOTS, save.get_module_name()),
                 _filename='plot_gc_vs_end_density')
# Index both frames by application date so they can be resampled by day.
df1 = df1.set_index(df1['application_date'])
df2 = df2.set_index(df2['application_date'])

# Grouping the data as per the requirement:
# daily sum of case_count within each segment.
df1 = df1.groupby('segment').resample('D').case_count.sum()
df2 = df2.groupby('segment').resample('D').case_count.sum()

# Checking if the given series is stationary for segment 1:
df1 = pd.DataFrame(df1)
rol_mean = df1['case_count'].rolling(30).mean()  # 30-row rolling mean
X = df1['case_count'].values
# ADF rule: test statistic < critical value -> reject H0.
# Author's note: the series is stationary.
x_adf = adfuller(X)
# KPSS rule: test statistic > critical value -> reject H0.
# Author's note: the series is non-stationary.
x_kpss = kpss(X)
#print(x_adf)
#print(x_kpss)

# Repeat both tests on the first difference of the series.
X_1 = df1.diff(1).dropna()
rol_mean1 = X_1['case_count'].rolling(30).mean()
x_1_adf = adfuller(X_1['case_count'].values)
x_1_kpss = kpss(X_1['case_count'].values)
#print(x_1_adf)
#print(x_1_kpss)

# Plotting to check for stationarity:
# 2x2 grid; the top-left panel shows the raw series with its rolling mean
# in red.
fig, ax = plt.subplots(2, 2)
ax[0, 0].plot(df1['case_count'].tolist())
ax[0, 0].plot(rol_mean.tolist(), color='red')
plt.show() # Coefficients values for lag>5 are statistically not significant and their impact on the model is minimal, except a few spikes at 8,11,22 and beyond. # <a id="subsection-four"></a> # # KPSS Test # # The KPSS test, short for, Kwiatkowski-Phillips-Schmidt-Shin (KPSS), is a type of Unit root test that tests for the stationarity of a given series around a deterministic trend. # # Here, the null hypothesis is that the series is **stationary**. # # That is, if p-value is < signif level (say 0.05), then the series is non-stationary and vice versa. # In[31]: stats, p, lags, critical_values = kpss(series, 'ct') # In[32]: print(f'Test Statistics : {stats}') print(f'p-value : {p}') print(f'Critical Values : {critical_values}') if p < 0.05: print('Series is not Stationary') else: print('Series is Stationary') # # Interpreting KPSS test results # # The output of the KPSS test contains 4 things:
def _check_input_validity(df, logger):
    """Log the shape and orientation of `df`; raise if a column is not numeric."""
    logger.info("Starting to check input data validity...")
    logger.info(f"Data shape (time, variables) is {df.shape}.")
    # the first axis - rows - is expected to represent the time dimension,
    # while the second axis - columns - is expected to represent variables;
    # thus, the first is expected to be much longer than the second
    logger.info(
        "Checking current data orientation (rows=time, columns=variables)...")
    if df.shape[1] >= df.shape[0]:
        logger.warning(
            ("stationarizer's input dataframe has more columns than rows! "
             "Columns are expected to represent variables, while rows "
             "represent time steps, and thus the input dataframe is "
             "expected to have more rows than columns. Either the input "
             "data is inverted, or the data has far more variables than "
             "samples."))
    else:
        logger.info("Data orientation is valid.")

    if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
        msg = "All columns of stationarizer's input dataframe must be numeric!"
        # The error used to be built and logged but never raised, so invalid
        # input silently continued into the statistical tests.
        logger.error(msg)
        raise ValueError(msg)


def _collect_test_pvalues(df, test, logger):
    """Run `test(series, regression="ct")` on each column; return the p-values."""
    pvals = []
    for colname in df.columns:
        result = test(df[colname], regression="ct")
        logger.info(
            (f"{colname}: test statistic={result[0]}, p-val={result[1]}."))
        pvals.append(result[1])
    return pvals


def _transform_and_align(df, actions, logger):
    """Apply the per-column transformations and align the results in time."""
    logger.info((f"Pre-transformation shape: {df.shape}, "
                 f"#NA: {df.isna().sum().sum()}"))
    post_cols = {}
    logger.info("Applying transformations...")
    for colname in df.columns:
        srs = df[colname]
        if Transformation.DETREND in actions[colname]:
            logger.info(f"Detrending {colname} (len={len(srs)}).")
            srs = detrend(srs, order=1, axis=0)
            logger.debug(f"# NaN after detrending: {np.isnan(srs).sum()}")
        if Transformation.DIFFRENTIATE in actions[colname]:
            logger.info(f"Diffrentiating {colname} (len={len(srs)}).")
            srs = diff(srs, k_diff=1)
            logger.debug(f"# NaN after diffrencing: {np.isnan(srs).sum()}")
        post_cols[colname] = srs
        logger.info(f"{colname} transformed (len={len(post_cols[colname])}).")

    # equalizing lengths
    min_len = min(len(col) for col in post_cols.values())
    logger.info(f"Min length to trim to: {min_len}")
    trimmed_cols = {}
    for colname in df.columns:
        # np.asarray accepts both a Series and a bare ndarray, whichever the
        # transformation helpers return.
        values = np.asarray(post_cols[colname])
        # Keep the trailing window: differencing shortens a series by dropping
        # its leading observation(s) (assumed of the imported `diff`, as
        # statsmodels' `diff` does - confirm), so trimming from the head would
        # shift differenced columns one step against the untouched ones.
        col = values[len(values) - min_len:]
        trimmed_cols[colname] = col
        logger.debug(
            f"#NA trimmed {colname} (len={len(col)}): {np.isnan(col).sum()}")
    postdf = pd.DataFrame.from_dict(trimmed_cols)
    logger.debug(f"trimmed df shape: {postdf.shape}")
    # Label rows with the time steps that actually survived the trimming.
    postdf.index = df.index[len(df.index) - min_len:]
    logger.info(f"Post trimming shape: {postdf.shape}")

    # checking for NaNs
    nan_count = postdf.isna().sum().sum()
    if nan_count > 0:
        nan_rows = postdf[postdf.isna().any(axis=1)]
        logger.debug(f"Post trimming NaN count: {nan_count}")
        logger.debug(f"Rows with Nan values:\n {nan_rows}")
    return postdf


def simple_auto_stationarize(
    df,
    verbosity=None,
    alpha=None,
    multitest=None,
    get_conclusions=False,
    get_actions=False,
):
    """Auto-stationarize the given time-series dataframe.

    Every column is tested with both the ADF and the KPSS test (regression
    "ct"), the resulting p-values are jointly corrected for multiple
    testing, and each column is detrended and/or differenced according to
    the conclusion drawn from the two corrected test outcomes.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe composed solely of numeric columns. Rows are expected to
        represent time steps and columns to represent variables.
    verbosity : int, logging.Logger, optional
        If an int is given, it is interpreted as the logging level to use.
        See https://docs.python.org/3/library/logging.html#levels for details.
        If a logging.Logger object is given, it is used for printing instead,
        with appropriate logging levels. If no value is provided, the default
        logging.Logger behaviour is used. The previous verbosity is restored
        when the function exits, including on error.
    alpha : float, optional
        Family-wise error rate (FWER) or false discovery rate (FDR), depending
        on the method used for multiple hypothesis testing error control. If
        no value is provided, the package default (DEF_ALPHA) is used.
    multitest : str, optional
        The multiple hypothesis testing error control method to use. If no
        value is provided, the Benjamini–Yekutieli procedure ("fdr_by") is
        used. See `the documentation of statsmodels' multipletests method for
        supported values
        <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`.
    get_conclusions : bool, defaults to False
        If set to true, a conclusions dict is returned.
    get_actions : bool, defaults to False
        If set to true, an actions dict is returned.

    Returns
    -------
    results : pandas.DataFrame or dict
        By default, only the transformed dataframe is returned. All columns
        are trimmed to the length of the shortest transformed column, keeping
        the trailing observations, and the frame is indexed by the matching
        trailing part of the input index. However, if get_conclusions or
        get_actions are set to True, a dict is returned instead, with the
        following mappings:
        - `postdf` - Maps to the transformed dataframe.
        - `conclusions` - Maps to a dict mapping each column name to the
          arrived conclusion regarding its stationarity.
        - `actions` - Maps to a dict mapping each column name to the
          transformations performed on it to stationarize it.

    Raises
    ------
    ValueError
        If any column of `df` is not of a numeric dtype.

    """  # noqa: E501
    prev_verbosity = None
    if verbosity is not None:
        prev_verbosity = set_verbosity_level(verbosity)
    try:
        if alpha is None:
            alpha = DEF_ALPHA
        # Honour the caller's choice of correction method; it was previously
        # ignored in favour of a hard-coded "fdr_by".
        method = "fdr_by" if multitest is None else multitest

        logger = get_logger()
        logger.info("Starting to auto-stationarize a dataframe!")
        _check_input_validity(df, logger)

        # util var
        n = len(df.columns)

        # testing for unit root
        logger.info(("Checking for the presence of a unit root in the input time "
                     "series using the Augmented Dicky-Fuller test"))
        logger.info(
            ("Reminder:\n "
             "Null Hypothesis: The series has a unit root (value of a=1); "
             "meaning, it is NOT stationary.\n"
             "Alternate Hypothesis: The series has no unit root; it is either "
             "stationary or non-stationary of a different model than unit root."))
        adf_pvals = _collect_test_pvalues(df, adfuller, logger)

        # testing for trend stationarity
        logger.info((
            "Testing for trend stationarity of input series using the KPSS test."))
        logger.info(("Reminder:\n"
                     "Null Hypothesis (H0): The series is trend-stationarity.\n"
                     "Alternative Hypothesis (H1): The series has a unit root."))
        kpss_pvals = _collect_test_pvalues(df, kpss, logger)

        # controlling the multiple-testing error rate over all 2n tests at once
        logger.info(
            ("Controlling the multiple hypothesis testing error rate using "
             f"the '{method}' method with α={alpha}."))
        by_res = multipletests(
            pvals=adf_pvals + kpss_pvals, alpha=alpha, method=method,
            is_sorted=False)
        reject, corrected_pvals = by_res[0], by_res[1]
        # the first n entries belong to ADF, the remaining n to KPSS
        adf_rejections, kpss_rejections = reject[:n], reject[n:]
        adf_corrected_pvals = corrected_pvals[:n]
        kpss_corrected_pvals = corrected_pvals[n:]

        # interpret results
        logger.info("Interpreting test results after FDR control...")
        conclusion_counts = {}
        conclusions = {}
        actions = {}
        for i, colname in enumerate(df.columns):
            conclusion = conclude_adf_and_kpss_results(
                adf_reject=adf_rejections[i], kpss_reject=kpss_rejections[i])
            conclusion_counts[conclusion] = (
                conclusion_counts.get(conclusion, 0) + 1)
            trans = CONCLUSION_TO_TRANSFORMATIONS[conclusion]
            conclusions[colname] = conclusion
            actions[colname] = trans
            logger.info((f"--{colname}--\n "
                         f"ADF corrected p-val: {adf_corrected_pvals[i]}, "
                         f"H0 rejected: {adf_rejections[i]}.\n"
                         f"KPSS corrected p-val: {kpss_corrected_pvals[i]}, "
                         f"H0 rejected: {kpss_rejections[i]}.\n"
                         f"Conclusion: {conclusion}\n Transformations: {trans}."))

        # making non-stationary series stationary!
        postdf = _transform_and_align(df, actions, logger)

        for conclusion, count in conclusion_counts.items():
            ratio = 100 * (count / n)
            logger.info(
                f"{count} series ({ratio}%) found with conclusion: {conclusion}.")
    finally:
        # Restore the caller's verbosity even when validation or a test raises.
        if verbosity is not None:
            set_verbosity_level(prev_verbosity)

    if not get_actions and not get_conclusions:
        return postdf
    results = {"postdf": postdf}
    if get_conclusions:
        results["conclusions"] = conclusions
    if get_actions:
        results["actions"] = actions
    return results