Example #1
import time
from datetime import datetime
from time import mktime

import pandas as pd
from django.utils import timezone
from statsmodels.regression.linear_model import OLSResults

# Project-local Django models (import path is an assumption for this snippet):
# from .models import IgnitionRow, IgnitionRowPredictionOLS


def storeOLSPrediction():
    # Fetch the most recent row; despite the name below, [:1] returns a
    # single record.
    data = list(
        IgnitionRow.objects.all().order_by('-pub_date')[:1].values())
    two_hours = data[::-1]  # restore chronological order
    data = pd.DataFrame(two_hours)
    data['pub_date'] = data.apply(lambda x: str(x['pub_date']), axis=1)
    data['pub_date_struct'] = data.apply(
        lambda x: time.strptime(x['pub_date'], "%Y-%m-%d %H:%M:%S.%f%z"),
        axis=1)
    data.index = data.apply(
        lambda x: datetime.fromtimestamp(mktime(x['pub_date_struct'])),
        axis=1)
    # Keep the hour as an int: a string hour would not match the integer
    # categories below, and every dummy would silently become NaN.
    data['hour'] = data.apply(lambda x: x['pub_date_struct'][3], axis=1)
    data['day_of_week'] = data.index.map(lambda x: x.weekday())
    # Fixed category sets guarantee the dummy columns match the training
    # design matrix even when the single row covers only one hour/day.
    data['hour'] = pd.Categorical(data['hour'], categories=list(range(24)))
    data['day_of_week'] = pd.Categorical(data['day_of_week'],
                                         categories=list(range(7)))
    hour_dummies = pd.get_dummies(data['hour'], drop_first=True)
    hour_dummies.columns = [
        'h' + str(elem) for elem in hour_dummies.columns
    ]
    day_of_week_dummies = pd.get_dummies(data['day_of_week'],
                                         drop_first=True)
    day_of_week_dummies.columns = [
        'dow' + str(elem) for elem in day_of_week_dummies.columns
    ]
    data = pd.concat((data, hour_dummies, day_of_week_dummies), axis=1)
    # One fitted model per stakes level; load each and predict on the new row.
    stakes = [5, 25, 50, 200, 500]
    preds = [
        OLSResults.load(
            f"regression_models/ols_9_21_data_{s}.pickle").predict(data)
        for s in stakes
    ]
    print("OLS PREDICTIONS: {}".format(preds))
    d = IgnitionRowPredictionOLS(num_players_5=preds[0],
                                 num_players_25=preds[1],
                                 num_players_50=preds[2],
                                 num_players_200=preds[3],
                                 num_players_500=preds[4],
                                 pub_date=timezone.now())
    d.save()
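The pd.Categorical(..., categories=...) step is what keeps a one-row prediction frame compatible with the training design matrix. A minimal, self-contained sketch of that trick (the column names here are illustrative, not from the snippet above):

import pandas as pd

# One new observation at 14:00 on a Wednesday (weekday 2).
row = pd.DataFrame({'hour': [14], 'day_of_week': [2]})

# Without fixed categories, get_dummies would emit a single column ('h14')
# instead of one column per hour the fitted model expects.
row['hour'] = pd.Categorical(row['hour'], categories=range(24))
dummies = pd.get_dummies(row['hour'], prefix='h', prefix_sep='')
print(dummies.shape)  # (1, 24) -- one column per hour category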
Example #2
def main(models):
    # `models` is the list of model file names; `coin` is assumed to be a
    # module-level variable naming the asset.
    r_squareds = []
    coefficients = []
    standard_errors = []
    residual_dfs = []

    for model in models:
        results = OLSResults.load(
            f'regression_models/return_predictions/{coin}/{model}')
        r_squareds.append(results.rsquared)
        coefficients.append(tuple(results.params))
        standard_errors.append(tuple(results.bse))
        # df_resid is the residual degrees of freedom, not the residuals.
        residual_dfs.append(results.df_resid)

    model_dict = {
        'r_squareds': r_squareds,
        'coefficients': coefficients,
        'standard_errors': standard_errors,
        'residual_dfs': residual_dfs
    }

    print(model_dict)

    model_df = pd.DataFrame(model_dict,
                            index=models).sort_values('r_squareds',
                                                      ascending=False)

    print(model_df)

    out_path = f'results/return_predictions/{coin}_model_results.csv'
    model_df.to_csv(out_path)
    print(f'model results saved to "{out_path}"')
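A hedged usage sketch; the directory layout and the module-level coin variable are assumptions carried over from the function body:

import os

coin = 'btc'  # assumed module-level name used inside main()
model_dir = f'regression_models/return_predictions/{coin}'
models = sorted(os.listdir(model_dir))  # every pickled fit for this coin
main(models)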
Example #3
	def unpickle(self):
		# Restore the UUID <-> key maps, then load every pickled OLS fit.
		with open('uuid_to_key.pickle', 'rb') as pkl_file:
			self.uuid_to_key = pickle.load(pkl_file)
		with open('key_to_uuid.pickle', 'rb') as pkl_file:
			self.key_to_uuid = pickle.load(pkl_file)
		for name in self.allModelName:
			key = name[:-7]  # strip the '.pickle' extension
			key = self.uuid_to_key[key]
			model = OLSResults.load('model/' + name)
			self.models[key] = model
Example #4
	def unpickle(self):
		# Load every pickled OLS fit, keyed by file name minus '.pickle'.
		for name in self.allModelName:
			key = name[:-7]
			model = OLSResults.load('model/' + name)
			self.models[key] = model
Example #5
def OLS_realtime(X_test, OLS_name):
    # Resolve the pickled model relative to the datasets directory,
    # load it, and predict on the incoming observations.
    OLS_name = '../datasets.nosync/' + OLS_name
    linear_results = OLSResults.load(OLS_name)
    ols_predict = linear_results.predict(X_test)
    return ols_predict
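A usage sketch; the feature columns and the file name are assumptions, and X_test must carry exactly the regressors the pickled model was fitted with:

import pandas as pd

X_test = pd.DataFrame({'const': [1.0], 'ret_lag1': [0.002]})
print(OLS_realtime(X_test, 'ols_returns.pickle'))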
Example #6
sd = setup_data(gf,
                instrument=instrument,
                pricediff=True,
                log=True,
                trading=True)

sd.head()

sd['intercept'] = 1

# One fitted model per target series and forecast horizon. Building the
# names and loading in a loop avoids fourteen near-identical load calls;
# .h5 is simply the extension these pickles were saved under.
names = (['HHLL_LogDiff USD_JPY_highMid%d' % h for h in range(-1, -7, -1)] +
         ['HHLL_LogDiff USD_JPY_lowMid%d' % h for h in range(-1, -7, -1)] +
         ['HHLL_LogDiff USD_JPY_highMid0', 'HHLL_LogDiff USD_JPY_highMid1'])

models = {name: OLSResults.load('./src/models/%s.h5' % name) for name in names}
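A hedged follow-up showing how one of the loaded fits might be applied to the prepared frame. It assumes the models were fitted on named pandas columns, so that params.index lists the regressors to select from sd:

# Horizon-1 model for the high; align sd's columns to the fit's exog names.
m = models['HHLL_LogDiff USD_JPY_highMid-1']
preds = m.predict(sd[m.params.index])
preds.tail()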
Example #7
    def load_model(self, filename):
        # OLSResults.load accepts a path or an open file object; here the
        # pickle is opened explicitly from the configured pickles directory.
        filepath = path.join(self.PICKLES_PATH, filename)
        with open(filepath, 'rb') as f:
            model = OLSResults.load(f)
        return model
Example #8
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.stats import gaussian_kde

# `df_rates`, `aRun`, `args`, and `fit` are assumed to be defined earlier.
plt.plot(df_rates["RPC1"] / df_rates["RPC2"])
# Save before show(): show() closes the figure, so calling savefig()
# afterwards writes out a blank canvas.
plt.savefig('Rates_ratio_' + str(aRun) + '_' + args.do_fit + '.pdf')
plt.show()
plt.close()

#X=df_rates[['DT1','DT2','DT3','DT4']]
X = df_rates[['RPC2', 'RPC3', 'RPC4']]
y = df_rates['RPC1']

if fit:
    results = smf.ols('RPC1 ~ RPC2 + RPC3 + RPC4', df_rates).fit()
    results.save("model.pickle")
else:
    from statsmodels.regression.linear_model import OLSResults
    results = OLSResults.load("model.pickle")

print(results.summary())
res = results.predict(X)

# Color each point by local density so overplotted regions stand out.
xy = np.vstack([y, res])
z = gaussian_kde(xy)(xy)

fig, ax = plt.subplots()
sc = ax.scatter(y, res, c=z, s=100, edgecolor='none')
plt.title('Predicted vs measured RPC rate')
plt.ylabel('Predicted rate')
plt.xlabel('Measured rate')
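To put a number on the agreement the scatter plot shows, one option is the plain correlation between measured and predicted rates (a sketch using the y and res defined above; for an OLS fit with an intercept, its square matches the usual in-sample R^2):

import numpy as np

corr = np.corrcoef(y, res)[0, 1]
print(f'corr = {corr:.3f}, corr^2 = {corr**2:.3f}')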
Example #9
	def __init__(self, model_file_name, test_file_name):
		# Load a pickled OLS fit and the CSV of observations to score.
		self.model_file_name = model_file_name
		self.test_file_name = test_file_name
		self.model = OLSResults.load(self.model_file_name)
		self.testing_set = pd.read_csv(self.test_file_name)
		self.prediction = []
Example #10
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLSResults
import numpy as np

nsample = 100
x = np.linspace(0, 10, 100)
X = np.column_stack((x, x**2))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
X = sm.add_constant(X)
y = np.dot(X, beta) + e
model = sm.OLS(y, X)
results = model.fit()
results.save("example.pickle")
new_results = OLSResults.load("example.pickle")
print(new_results.summary())
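Pickled results objects embed the full training data. If file size matters, statsmodels' save accepts remove_data=True to strip the arrays that are not needed for inspecting coefficients or predicting on new exog. A short sketch continuing the example above:

results.save("example_small.pickle", remove_data=True)
slim = OLSResults.load("example_small.pickle")
print(slim.params)  # coefficients survive; the data arrays do not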
Example #11
def calc_granger_causality(x,
                           diff_x,
                           granger_list,
                           group_var,
                           groups,
                           maxlag,
                           both_sides=False,
                           only_min_crit=False,
                           filter_p_value=None):
    """Compute Granger causality tests for pairs of time series.

    Takes time series differencing into account on the basis of the
    table provided in `diff_x`.

    PARAMETERS:
    1) x - input data containing the chosen time series
    2) diff_x - table containing the number of times each time series
        should be differenced to make it stationary
    3) granger_list - list of two-element tuples. Each tuple contains:
        a) first element - the name of the time series which is a
            potential result in Granger causality (Y)
        b) second element - the name of the time series which is a
            potential cause in Granger causality (X)
        i.e. the structure of the tuple is (Y, X)
    4) group_var - grouping variable in `x` on the basis of which the
        time series named in `granger_list` should be divided
    5) groups - values of `group_var` for which Granger causality should
        be checked
    6) maxlag - maximum number of lags for which Granger causality should
        be checked
    7) both_sides - a boolean indicating whether to also check that the
        potential causes (X's) are not results of the potential resulting
        variables (i.e. X~Y is also checked if both_sides=True)
    8) only_min_crit - a boolean indicating whether only observations
        with minimum values of the AIC and BIC criteria should be reported
    9) filter_p_value - significance level of the Granger causality test
        below which results should be reported; by default all results are
        reported; e.g. to obtain only the results significant at
        alpha = .05, provide filter_p_value=.05
    """
    import pandas as pd
    import numpy as np
    from useful import repeated
    from grangercausalitytests_mod import grangercausalitytests_mod
    # grangercausalitytests_mod returns fitted OLSResults objects, whose
    # aic/bic attributes are read off below.
    from statsmodels.regression.linear_model import OLSResults

    if both_sides:
        # Append the reversed (X, Y) pairs so causality is tested both ways.
        granger_list_reversed = [pair[::-1] for pair in granger_list]
        granger_list = granger_list.copy()
        granger_list.extend(granger_list_reversed)
        len_granger_list = len(granger_list)

    columns = [
        'ID', 'group', 'y', 'y_diff', 'x', 'x_diff', 'lag', 'p_value',
        'AIC', 'BIC'
    ]
    if only_min_crit:
        columns += ['min_AIC', 'min_BIC']
    results = pd.DataFrame(columns=columns)

    for g in groups:
        for i, gl in enumerate(granger_list):
            if both_sides and i >= len_granger_list / 2:
                # A reversed pair shares the ID of its original pair:
                # ID(Y~X) = ID(X~Y).
                i = int(i - len_granger_list / 2)

            # Difference each series as many times as diff_x prescribes.
            yvar_diffs = int(diff_x.at[g, gl[0]])
            yvar = repeated(pd.DataFrame.diff,
                            yvar_diffs)(x[x[group_var] == g][gl[0]])
            xvar_diffs = int(diff_x.at[g, gl[1]])
            xvar = repeated(pd.DataFrame.diff,
                            xvar_diffs)(x[x[group_var] == g][gl[1]])
            # Note: the underlying regressions need more than
            # 3 * maxlag + 1 observations to be well-posed.
            gstats = grangercausalitytests_mod(
                pd.concat([yvar, xvar], axis=1).dropna(),
                maxlag=maxlag,
                verbose=False)
            len_gstats = len(gstats)

            # gstats[lag][1][1] is the unrestricted OLS fit for that lag.
            df = pd.DataFrame({
                'ID': np.repeat(i, len_gstats),
                'group': np.repeat(g, len_gstats),
                'y': np.repeat(gl[0], len_gstats),
                'y_diff': np.repeat(yvar_diffs, len_gstats),
                'x': np.repeat(gl[1], len_gstats),
                'x_diff': np.repeat(xvar_diffs, len_gstats),
                'lag': range(1, len_gstats + 1),
                'p_value': [
                    gstats[lag][0]['ssr_ftest'][1]
                    for lag in range(1, len_gstats + 1)
                ],
                'AIC': [
                    gstats[lag][1][1].aic
                    for lag in range(1, len_gstats + 1)
                ],
                'BIC': [
                    gstats[lag][1][1].bic
                    for lag in range(1, len_gstats + 1)
                ],
            })
            if only_min_crit:
                min_AIC = df.loc[df.AIC == min(df.AIC)]
                min_BIC = df.loc[df.BIC == min(df.BIC)]
                if min_AIC.equals(min_BIC):
                    df = min_AIC.assign(min_AIC=True, min_BIC=True)
                else:
                    min_AIC = min_AIC.assign(min_AIC=True, min_BIC=False)
                    min_BIC = min_BIC.assign(min_AIC=False, min_BIC=True)
                    df = pd.concat([min_AIC, min_BIC], axis=0)
            results = (pd.concat([results, df], axis=0)
                       .sort_values(by=['group', 'ID'])
                       .reset_index(drop=True))

    if filter_p_value is not None:
        return results[results.p_value < filter_p_value]
    return results


### End of code
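A hedged usage sketch: grangercausalitytests_mod and useful.repeated are project-local modules, so the values below are purely illustrative and only indicate the shapes the function expects:

import pandas as pd

# Toy long-format input: 24 monthly observations for one technology.
x = pd.DataFrame({
    'tech': ['tensorflow'] * 24,
    'so_usage_cnt': range(24),
    'hn_all_match_score': range(24),
})
# Differencing orders per (group, series), indexed by group.
diff_x = pd.DataFrame({'so_usage_cnt': [1], 'hn_all_match_score': [1]},
                      index=['tensorflow'])

out = calc_granger_causality(
    x, diff_x,
    granger_list=[('hn_all_match_score', 'so_usage_cnt')],  # (Y, X)
    group_var='tech',
    groups=['tensorflow'],
    maxlag=4,
    both_sides=True,
    only_min_crit=True)
print(out)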
Example #12
import numpy as np
import pandas as pd

print(pred5.corr(wcat["AT"]))
#plt.scatter(x=wcat["Waist"], y=wcat["AT"], color="blue");plt.xlabel="Waist";plt.ylabel="AT"
#plt.plot(wcat["Waist"], pred5, color="red")

# Error metrics for the model on the original scale.
res5 = wcat.AT - pred5
sqres5 = res5 * res5
mse5 = np.mean(sqres5)
rmse5 = np.sqrt(mse5)

# Saving the fitted model to disk ...
model5.save("slr_wcat.pkl")

# ... and loading it back to compare results. (pickle.dump/pickle.load
# would work as well; OLSResults.load wraps the same mechanism.)
from statsmodels.regression.linear_model import OLSResults
model = OLSResults.load("slr_wcat.pkl")

# The model predicts log(AT), hence the np.exp back-transform.
x = np.exp(
    model.predict(
        pd.DataFrame([[36, 1296, 46656]],
                     columns=["Waist", "Waist_sq", "Waist_cb"])))
print(round(float(x.iloc[0]), 2))