def storeOLSPrediction():
    # pull the most recent row and turn it into a one-row DataFrame
    data = list(IgnitionRow.objects.all().order_by('-pub_date')[:1].values())
    data = pd.DataFrame(data[::-1])
    data['pub_date'] = data.apply(lambda x: str(x['pub_date']), axis=1)
    data['pub_date_struct'] = data.apply(
        lambda x: time.strptime(x['pub_date'], "%Y-%m-%d %H:%M:%S.%f%z"),
        axis=1)
    data.index = data.apply(
        lambda x: datetime.fromtimestamp(mktime(x['pub_date_struct'])),
        axis=1)
    # keep the hour as an int so it matches the integer categories below
    # (the original cast it to str, which never matches them, zeroing the dummies)
    data['hour'] = data.apply(
        lambda x: time.strptime(x['pub_date'], "%Y-%m-%d %H:%M:%S.%f%z")[3],
        axis=1)
    data['day_of_week'] = data.index.map(lambda x: x.weekday())

    # fix the full category sets so the dummy columns line up with the
    # training design matrix even though only one row is present
    data['hour'] = pd.Categorical(data['hour'], categories=list(range(24)))
    data['day_of_week'] = pd.Categorical(data['day_of_week'],
                                         categories=list(range(7)))
    hour_dummies = pd.get_dummies(data['hour'], drop_first=True)
    hour_dummies.columns = ['h' + str(elem) for elem in hour_dummies.columns]
    day_of_week_dummies = pd.get_dummies(data['day_of_week'], drop_first=True)
    day_of_week_dummies.columns = [
        'dow' + str(elem) for elem in day_of_week_dummies.columns
    ]
    data = pd.concat((data, hour_dummies, day_of_week_dummies), axis=1)

    # one fitted model per table-stakes level
    results5 = OLSResults.load("regression_models/ols_9_21_data_5.pickle")
    results25 = OLSResults.load("regression_models/ols_9_21_data_25.pickle")
    results50 = OLSResults.load("regression_models/ols_9_21_data_50.pickle")
    results200 = OLSResults.load("regression_models/ols_9_21_data_200.pickle")
    results500 = OLSResults.load("regression_models/ols_9_21_data_500.pickle")
    preds = [results.predict(data) for results in
             (results5, results25, results50, results200, results500)]
    print("OLS PREDICTIONS: {}".format(preds))

    d = IgnitionRowPredictionOLS(num_players_5=preds[0],
                                 num_players_25=preds[1],
                                 num_players_50=preds[2],
                                 num_players_200=preds[3],
                                 num_players_500=preds[4],
                                 pub_date=timezone.now())
    d.save()
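# Hedged variant sketch (not in the original source): predict() on a one-row
# frame returns a one-element pandas Series, so extracting scalars first avoids
# relying on Django to coerce a Series into a numeric model field.
# IgnitionRowPredictionOLS and `preds` are taken from the function above.
scalar_preds = [float(p.iloc[0]) for p in preds]
d = IgnitionRowPredictionOLS(num_players_5=scalar_preds[0],
                             num_players_25=scalar_preds[1],
                             num_players_50=scalar_preds[2],
                             num_players_200=scalar_preds[3],
                             num_players_500=scalar_preds[4],
                             pub_date=timezone.now())
d.save()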
def main(models):  # models is the list of model file names
    r_squareds = []
    coefficients = []
    standard_errors = []
    residuals = []
    for model in models:
        # `coin` is expected to be defined at module level
        results = OLSResults.load(
            f'regression_models/return_predictions/{coin}/{model}')
        r_squareds.append(results.rsquared)
        coefficients.append(tuple(results.params))
        standard_errors.append(tuple(results.bse))
        # residual degrees of freedom, not the residuals themselves
        residuals.append(results.df_resid)
    model_dict = {
        'r_squareds': r_squareds,
        'coefficients': coefficients,
        'standard_errors': standard_errors,
        'residuals': residuals
    }
    print(model_dict)
    model_df = pd.DataFrame(model_dict, index=models).sort_values(
        'r_squareds', ascending=False)
    print(model_df)
    model_df.to_csv(f'results/return_predictions/{coin}_model_results.csv')
    print(f'model results saved to '
          f'"results/return_predictions/{coin}_model_results.csv"')
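# Hedged usage sketch: main() reads `coin` from module scope, so it must be set
# before the call. Both the coin and the pickle file names are hypothetical.
coin = 'btc'
main(['ols_lag1.pickle', 'ols_lag2.pickle'])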
def unpickle(self):
    with open('uuid_to_key.pickle', 'rb') as pkl_file:
        self.uuid_to_key = pickle.load(pkl_file)
    with open('key_to_uuid.pickle', 'rb') as pkl_file:
        self.key_to_uuid = pickle.load(pkl_file)
    for name in self.allModelName:
        key = name[:-7]
        key = self.uuid_to_key[key]
        model = OLSResults.load('model/' + name)
        self.models[key] = model
def unpickle(self):
    for name in self.allModelName:
        key = name[:-7]
        model = OLSResults.load('model/' + name)
        self.models[key] = model
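# Hedged sketch of the save side implied by unpickle() above: writing each
# fitted result to model/<key>.pickle lets name[:-7] recover the key, since
# ".pickle" is 7 characters. `fitted_models` (key -> fitted OLS results) is a
# hypothetical input; this helper is not part of the original source.
import os

def pickle_models(fitted_models, model_dir='model'):
    os.makedirs(model_dir, exist_ok=True)
    for key, results in fitted_models.items():
        results.save(os.path.join(model_dir, key + '.pickle'))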
def OLS_realtime(X_test, OLS_name):
    OLS_name = '../datasets.nosync/' + OLS_name
    linear_results = OLSResults.load(OLS_name)
    ols_predict = linear_results.predict(X_test)
    return ols_predict
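# Hedged usage sketch: X_test must carry the same regressors the model was fit
# on (including a constant column if one was added at fit time). The column
# names and the pickle file name below are hypothetical.
import pandas as pd

X_test = pd.DataFrame({'const': [1.0], 'x1': [0.5], 'x2': [1.2]})
print(OLS_realtime(X_test, 'ols_model.pickle'))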
sd = setup_data(gf, instrument=instrument, pricediff=True, log=True,
                trading=True)
sd.head()
sd['intercept'] = 1

# load one fitted model per target column / horizon
model_names = (
    ['HHLL_LogDiff USD_JPY_highMid-{}'.format(i) for i in range(1, 7)] +
    ['HHLL_LogDiff USD_JPY_lowMid-{}'.format(i) for i in range(1, 7)] +
    ['HHLL_LogDiff USD_JPY_highMid0', 'HHLL_LogDiff USD_JPY_highMid1']
)
models = {name: OLSResults.load('./src/models/{}.h5'.format(name))
          for name in model_names}
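# Hedged sketch: scoring the prepared frame with one of the loaded models.
# model.exog_names lists the regressor names used at fit time; this assumes
# `sd` contains columns with exactly those names.
key = 'HHLL_LogDiff USD_JPY_highMid-1'
cols = models[key].model.exog_names
print(models[key].predict(sd[cols]).tail())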
def load_model(self, filename):
    filepath = path.join(self.PICKLES_PATH, filename)
    with open(filepath, 'rb') as f:
        model = OLSResults.load(f)
    return model
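# Hedged usage sketch: the enclosing class is not shown above, so this minimal
# stand-in (hypothetical name ModelStore) just supplies the PICKLES_PATH
# attribute that load_model expects.
from os import path
from statsmodels.regression.linear_model import OLSResults

class ModelStore:
    PICKLES_PATH = 'pickles'  # hypothetical directory

    def load_model(self, filename):
        filepath = path.join(self.PICKLES_PATH, filename)
        with open(filepath, 'rb') as f:
            return OLSResults.load(f)

model = ModelStore().load_model('example.pickle')  # hypothetical file name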
plt.plot(df_rates["RPC1"] / df_rates["RPC2"])
# save before show(), which leaves a blank figure on some backends
plt.savefig('Rates_ratio_' + str(aRun) + '_' + args.do_fit + '.pdf')
plt.show()
plt.close()

#X = df_rates[['DT1', 'DT2', 'DT3', 'DT4']]
X = df_rates[['RPC2', 'RPC3', 'RPC4']]
y = df_rates['RPC1']

if fit is True:
    results = smf.ols('RPC1 ~ RPC2 + RPC3 + RPC4', df_rates).fit()
    results.save("model.pickle")
else:
    from statsmodels.regression.linear_model import OLSResults
    results = OLSResults.load("model.pickle")

print(results.summary())
res = results.predict(X)

# colour each point by the local density of (measured, predicted) pairs
xy = np.vstack([y, res])
z = gaussian_kde(xy)(xy)
fig, ax = plt.subplots()
sc = ax.scatter(y, res, c=z, s=100, edgecolor='none')
plt.title('Predicted vs measured RPC rate')
plt.ylabel('Predicted rate')
plt.xlabel('Measured rate')
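# Hedged follow-up sketch: a single goodness-of-fit number to accompany the
# scatter above, computed with numpy, which the script already uses as np.
ss_res = np.sum((y - res) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
print('R^2 of predicted vs measured rate: {:.3f}'.format(1 - ss_res / ss_tot))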
def __init__(self, model_file_name, test_file_name):
    self.model_file_name = model_file_name
    self.test_file_name = test_file_name
    self.model = OLSResults.load(self.model_file_name)
    self.testing_set = pd.read_csv(self.test_file_name)
    self.prediction = []
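# Hedged sketch of a matching predict step for the class above; whether the
# CSV columns line up with the model's design matrix is an assumption.
def predict(self):
    self.prediction = self.model.predict(self.testing_set)
    return self.prediction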
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLSResults
import numpy as np

# simulate y = 1 + 0.1*x + 10*x**2 + noise
nsample = 100
x = np.linspace(0, 10, 100)
X = np.column_stack((x, x**2))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
X = sm.add_constant(X)
y = np.dot(X, beta) + e

# fit, save to disk, and load back
model = sm.OLS(y, X)
results = model.fit()
results.save("example.pickle")
new_results = OLSResults.load("example.pickle")
print(new_results.summary())
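# Hedged variant: save() also accepts remove_data=True, which strips the
# nobs-length data arrays from the pickle to keep the file small; the fitted
# parameters still load. The file name is a hypothetical placeholder.
results.save("example_slim.pickle", remove_data=True)
slim = OLSResults.load("example_slim.pickle")
print(slim.params)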
def calc_granger_causality(x, diff_x, granger_list, group_var, groups, maxlag,
                           both_sides=False, only_min_crit=False,
                           filter_p_value=None):
    """Computes the Granger causality test for pairs of time series.

    Takes into consideration time series differencing on the basis of the
    table provided in diff_x.

    PARAMETERS:
    1) x - input data containing the chosen time series
    2) diff_x - table containing the number of times each time series should
       be differenced to make it stationary
    3) granger_list - list of two-element tuples. Each tuple contains:
       a) first element - the name of the time series which is a potential
          result in Granger causality (Y)
       b) second element - the name of the time series which is a potential
          cause in Granger causality (X)
       i.e. the structure of the tuple is (Y, X)
    4) group_var - grouping variable in `x` on the basis of which the time
       series named in `granger_list` should be divided
    5) groups - values of `group_var` for which Granger causality should be
       checked
    6) maxlag - maximum number of lags for which Granger causality should be
       checked
    7) both_sides - a boolean indicating whether to also check that the
       potential causes (X's) are not the results of the potential resulting
       variables (i.e. X~Y is checked if both_sides=True)
    8) only_min_crit - a boolean indicating whether only observations with
       the minimum values of the AIC and BIC criteria should be reported
    9) filter_p_value - significance level of the Granger causality test
       below which results should be reported; by default all results are
       reported; e.g. to obtain only the results significant at alpha = .05,
       provide filter_p_value=.05
    """
    import pandas as pd
    import numpy as np
    from useful import repeated
    from grangercausalitytests_mod import grangercausalitytests_mod
    from statsmodels.regression.linear_model import OLSResults

    if both_sides:
        granger_list_reversed = [gl[::-1] for gl in granger_list]
        granger_list = granger_list.copy()
        granger_list.extend(granger_list_reversed)

    len_granger_list = len(granger_list)

    if only_min_crit:
        results = pd.DataFrame(columns=[
            'ID', 'group', 'y', 'y_diff', 'x', 'x_diff', 'lag', 'p_value',
            'AIC', 'BIC', 'min_AIC', 'min_BIC'
        ])
    else:
        results = pd.DataFrame(columns=[
            'ID', 'group', 'y', 'y_diff', 'x', 'x_diff', 'lag', 'p_value',
            'AIC', 'BIC'
        ])

    for g in groups:
        for i, gl in enumerate(granger_list):
            if both_sides:
                # the ID of a reversed pair equals the ID of the original
                # pair: ID(Y~X) = ID(X~Y)
                if i >= len_granger_list / 2:
                    i = i - len_granger_list / 2

            yvar_diffs = int(diff_x.at[g, gl[0]])
            yvar = repeated(pd.DataFrame.diff,
                            yvar_diffs)(x[x[group_var] == g][gl[0]])
            xvar_diffs = int(diff_x.at[g, gl[1]])
            xvar = repeated(pd.DataFrame.diff,
                            xvar_diffs)(x[x[group_var] == g][gl[1]])

            gstats = grangercausalitytests_mod(
                pd.concat([yvar, xvar], axis=1).dropna(),
                maxlag=maxlag, verbose=False)
            len_gstats = len(gstats)

            # gstats[lag][1][1] is the fitted unrestricted model for that lag,
            # an OLSResults instance; read aic/bic from the instance rather
            # than calling the class attribute, which is not callable
            df = pd.DataFrame({
                'ID': np.repeat(i, len_gstats),
                'group': np.repeat(g, len_gstats),
                'y': np.repeat(gl[0], len_gstats),
                'y_diff': np.repeat(yvar_diffs, len_gstats),
                'x': np.repeat(gl[1], len_gstats),
                'x_diff': np.repeat(xvar_diffs, len_gstats),
                'lag': range(1, len_gstats + 1),
                'p_value': [gstats[lag][0]['ssr_ftest'][1]
                            for lag in range(1, len_gstats + 1)],
                'AIC': [gstats[lag][1][1].aic
                        for lag in range(1, len_gstats + 1)],
                'BIC': [gstats[lag][1][1].bic
                        for lag in range(1, len_gstats + 1)],
            })

            if only_min_crit:
                min_AIC = df.loc[df.AIC == min(df.AIC)]
                min_BIC = df.loc[df.BIC == min(df.BIC)]
                if min_AIC.equals(min_BIC):
                    min_AIC = min_AIC.assign(min_AIC=True, min_BIC=True)
                    df = min_AIC
                else:
                    min_AIC = min_AIC.assign(min_AIC=True, min_BIC=False)
                    min_BIC = min_BIC.assign(min_AIC=False, min_BIC=True)
                    df = pd.concat([min_AIC, min_BIC], axis=0)

            results = (pd.concat([results, df], axis=0)
                       .sort_values(by=['group', 'ID'])
                       .reset_index(drop=True))

    if filter_p_value is not None:
        return results[results.p_value < filter_p_value]
    return results
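# Hedged usage sketch: calc_granger_causality depends on the project-local
# `useful` and `grangercausalitytests_mod` modules, so this only runs where
# those are importable. The frame, group, and differencing table below are
# hypothetical.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = pd.DataFrame({'tech': 'tensorflow',
                  'a': rng.normal(size=60).cumsum(),
                  'b': rng.normal(size=60)})
diff_x = pd.DataFrame({'a': [1], 'b': [0]}, index=['tensorflow'])
res = calc_granger_causality(x, diff_x, granger_list=[('b', 'a')],
                             group_var='tech', groups=['tensorflow'],
                             maxlag=4)
print(res)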
pred5.corr(wcat["AT"])
#plt.scatter(x=wcat["Waist"], y=wcat["AT"], color="blue"); plt.xlabel("Waist"); plt.ylabel("AT")
#plt.plot(wcat["Waist"], pred5, color="red")

# residuals and RMSE for model 5
res5 = wcat.AT - pred5
sqres5 = res5 * res5
mse5 = np.mean(sqres5)
rmse5 = np.sqrt(mse5)

# saving the model to disk and loading it back
model5.save("slr_wcat.pkl")
from statsmodels.regression.linear_model import OLSResults
model = OLSResults.load("slr_wcat.pkl")
# plain pickle would also work:
# pickle.dump(model5, open("slr_wcat.pkl", "wb"))
# slr_wcat = pickle.load(open("slr_wcat.pkl", "rb"))

# the model predicts log(AT), so back-transform with np.exp
x = np.exp(model.predict(
    pd.DataFrame([[36, 1296, 46656]],
                 columns=["Waist", "Waist_sq", "Waist_cb"])))
print(round(float(x.iloc[0]), 2))
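# Hedged helper sketch (hypothetical name): the model above expects Waist,
# Waist_sq and Waist_cb and predicts log(AT), hence the np.exp back-transform.
def predict_at(waist):
    row = pd.DataFrame([[waist, waist ** 2, waist ** 3]],
                       columns=["Waist", "Waist_sq", "Waist_cb"])
    return round(float(np.exp(model.predict(row)).iloc[0]), 2)

print(predict_at(36))  # matches the [[36, 1296, 46656]] row above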