import statsmodels.formula.api as sm

def VIF(data, target, columns):
    """Print the variance inflation factor (VIF) of each column against the target."""
    for i in range(0, columns.shape[0]):
        y = data[columns[i]]
        x = target
        # patsy resolves the local names y and x, since they are not columns of data
        rsq = sm.ols(formula="y ~ x", data=data).fit().rsquared
        vif = round(1 / (1 - rsq), 2)
        print(columns[i], " VIF = ", vif)
def regr(ticker1, ticker2, attr1, attr2):
    """Regress attr1 of ticker1 on attr2 of ticker2 over their common dates."""
    attr_1 = (attr1 + "_1").replace(' ', '_')
    attr_2 = (attr2 + "_2").replace(' ', '_')
    ticker1 = ticker1[[attr1]].rename(columns={attr1: attr_1})
    ticker2 = ticker2[[attr2]].rename(columns={attr2: attr_2})
    df = ticker1.join(other=ticker2, how='inner')
    fml = attr_1 + ' ~ ' + attr_2
    return sm.ols(formula=fml, data=df).fit()
def vif_cal(input_data):
    """Return a DataFrame with the variance inflation factor (VIF) of every column."""
    import statsmodels.formula.api as sm
    import pandas as pd
    x_vars = input_data
    xvar_names = input_data.columns
    vif_lst = []
    for i in range(0, xvar_names.shape[0]):
        # regress each variable on all the others
        y = x_vars[xvar_names[i]]
        x = x_vars[xvar_names.drop(xvar_names[i])]
        rsq = sm.ols(formula="y ~ x", data=x_vars).fit().rsquared
        vif = round(1 / (1 - rsq), 2)
        vif_lst.append(vif)
    vif_df = pd.concat([pd.DataFrame(xvar_names, columns=['Variable']),
                        pd.DataFrame(vif_lst, columns=['VIF'])], axis=1)
    return vif_df
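# A minimal usage sketch for vif_cal, assuming a purely numeric predictor
# DataFrame (the column names and data here are made up for illustration):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({'x1': rng.normal(size=100)})
demo['x2'] = demo['x1'] * 2 + rng.normal(size=100)   # collinear with x1, so its VIF is high
demo['x3'] = rng.normal(size=100)                    # independent, so its VIF stays near 1
print(vif_cal(demo))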
def regression(csv_file_path, delim=","):
    """
    Runs an OLS regression.

    This regresses the last column on all other non-index columns,
    and prints a summary.

    Args:
        csv_file_path: the filepath of the csv data file, organized so
            that the endogenous variable is the last column.
    """
    import pandas as pd
    from statsmodels.api import OLS as ols
    df = pd.read_csv(csv_file_path, sep=delim).dropna()
    X = df[df.columns[1:-1]].astype(float)  # skip the index column, keep regressors numeric
    X['const'] = 1  # add the intercept column by hand
    y = df[df.columns[-1]]
    print(ols(y, X).fit().summary())
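# A minimal usage sketch, assuming a hypothetical CSV laid out as the docstring
# describes (an index column first, the endogenous variable last):
# regression("example_data.csv", delim=",")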
import numpy as np
import statsmodels.api as smapi

def fill_regressed_data(S):
    """ Fill missing returns by linear combinations of assets without missing returns. """
    S = S.copy()
    R = np.log(S).diff()
    R.iloc[0] = 0
    X = R.dropna(axis=1)
    for col in set(S.columns) - set(X.columns):
        R.iloc[0, R.columns.get_loc(col)] = np.nan
        y = R[col]
        missing = y.isnull()
        # fit regression of the incomplete asset's returns on the complete ones
        # (statsmodels OLS replaces the long-removed pandas.stats ols interface)
        res = smapi.OLS(y[~missing], smapi.add_constant(X[~missing])).fit()
        pred = res.predict(smapi.add_constant(X[missing]))
        # integrate predicted returns back to absolute log prices
        pred = pred.cumsum()
        pred += np.log(S[col].dropna().iloc[0]) - pred.iloc[-1]
        # fill missing data
        S[col] = S[col].fillna(np.exp(pred))
    return S
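# A toy usage sketch for fill_regressed_data: two complete price series and one
# that starts late, whose early prices are back-filled from the regression (all
# numbers here are made up for illustration):
import pandas as pd

idx = pd.date_range('2020-01-01', periods=6)
prices = pd.DataFrame({'A': [10.0, 10.5, 10.2, 10.8, 11.0, 11.3],
                       'B': [20.0, 20.4, 20.1, 20.9, 21.2, 21.8],
                       'C': [np.nan, np.nan, 30.0, 30.5, 30.2, 31.0]},
                      index=idx)
print(fill_regressed_data(prices))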
return csvdataRowsNumericalCategorical_with_propensity_scores_overlap_only_matched


#### [7] Model controlling for blocks
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

df2 = pd.DataFrame(
    csvdataRowsNumericalCategorical_with_propensity_scores_overlap_only_matched[1:],
    columns=[
        element.replace('.', '_')
        for element in csvdataRowsNumericalCategorical_with_propensity_scores_overlap_only_matched[0]
    ])

FE_ols = sm.ols(formula='Alumni_Donations_2018 ~ Ranked_2017 + C(Match) - 1', data=df2).fit()
#FE_ols = sm.ols(formula="Lung_Hospitalizations ~ C(Vaping_Ban, Treatment(reference='0')) + C(State_Id) + C(Year) - 1", data=df).fit()
print(FE_ols.summary())
# Strongly correlated: statistically significant at the 0.001 level, and being ranked
# increases alumni donations by ~$509,000 in 2018 (this still seems low, but perhaps
# the time value of money explains it, or the example is just illustrative)


def build_basic_regression_outputs(OLSResults):
    # How to get model attributes:
    # https://stackoverflow.com/questions/48522609/retrieve-model-estimates-from-statsmodels
    number_of_observations = str(int(OLSResults.nobs))
    r_squared = str(round(OLSResults.rsquared, 3))
    variables = [key for key, value in OLSResults.params.items()]
    coefficients = OLSResults.params
    standard_error = OLSResults.bse
    p_values = OLSResults.pvalues
    rows = [[variables[i], coefficients.iloc[i], standard_error.iloc[i], p_values.iloc[i]]
            for i in range(len(coefficients))]
    return rows
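# A minimal usage sketch for build_basic_regression_outputs, reusing the FE_ols fit
# from above; the column labels here are illustrative, not from the original source:
summary_rows = build_basic_regression_outputs(FE_ols)
print(pd.DataFrame(summary_rows, columns=['Variable', 'Coefficient', 'Std. Error', 'p-value']))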
Vapor_df = df[df.Temperature > 151.12 + 27.9058 * df.Pressure**0.237508]  # cut-off curve for vapor densities
Vapor_df.tail(3)  # display a few rows to see how they are being arranged

threedee = plt.figure().gca(projection='3d')
threedee.scatter(Vapor_df['Temperature'], Vapor_df['Pressure'], Vapor_df['Density'])
threedee.set_xlabel('Temperature (K)')
threedee.set_ylabel('Pressure (kPa)')
threedee.set_zlabel('Density (kg/m^3)')
plt.show()

# In[26]:

Vapor_df = Vapor_df.assign(Temp_sq=Vapor_df.Temperature**2)
resultVapor = sm.ols(formula="Density ~ Temp_sq + Temperature + Pressure", data=Vapor_df).fit()
forecast = resultVapor.predict(Vapor_df[['Temp_sq', 'Temperature', 'Pressure']])
ErrorV = pd.DataFrame({'Error': forecast - Vapor_df.Density})
print(resultVapor.params)
print(resultVapor.summary())
ErrorV.describe()

# In[22]:

plt.hist(ErrorV.Error, bins=150)  # histogram of the error distribution
plt.title('Error distribution')
plt.xlabel('Spread from function (kg/m^3)')
plt.ylabel('Frequency')

# ## Result for the vapor portion: Density_vapor = a*T^2 + b*T + c*P + f
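# A short follow-up sketch mapping the fitted parameters back to the stated form;
# the names a, b, c, f come from the comment above, and the index labels are the
# ones the formula interface produces:
a = resultVapor.params['Temp_sq']
b = resultVapor.params['Temperature']
c = resultVapor.params['Pressure']
f = resultVapor.params['Intercept']
print(f"Density_vapor = {a:.4g}*T^2 + {b:.4g}*T + {c:.4g}*P + {f:.4g}")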
# ### Multiplicative interactions
#
# ":" adds a new column to the design matrix with the interaction of the other
# two columns. "*" will also include the individual columns that were multiplied together:

res1 = ols(formula='Lottery ~ Literacy : Wealth - 1', data=df).fit()
res2 = ols(formula='Lottery ~ Literacy * Wealth - 1', data=df).fit()
print(res1.params, '\n')
print(res2.params)

# Many other things are possible with operators. Please consult the
# [patsy docs](https://patsy.readthedocs.org/en/latest/formulas.html) to learn more.

# ## Functions
#
# You can apply vectorized functions to the variables in your model:

res = sm.ols(formula='Lottery ~ np.log(Literacy)', data=df).fit()
print(res.params)

# Define a custom function:

def log_plus_1(x):
    return np.log(x) + 1.

res = sm.ols(formula='Lottery ~ log_plus_1(Literacy)', data=df).fit()
print(res.params)

# Any function that is in the calling namespace is available to the formula.

# ## Using formulas with models that do not (yet) support them
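# A minimal sketch of the pattern that heading refers to: build the design matrices
# with patsy directly, then hand them to a model's array interface (here plain
# statsmodels OLS; the formula and the smapi alias are illustrative):
import patsy
import statsmodels.api as smapi

y_mat, X_mat = patsy.dmatrices('Lottery ~ Literacy + Wealth', data=df, return_type='dataframe')
print(smapi.OLS(y_mat, X_mat).fit().params)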
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model

df = pd.read_csv(r'joined_filtered_dataset.csv')
df = df.drop('id', axis=1)
df = df.drop('key_emotion', axis=1)
df = df.dropna()

X = df.drop("perceived_trust", axis=1)
Y = df['perceived_trust']

# scikit-learn fit (adds an intercept by default)
regr = linear_model.LinearRegression()
regr.fit(X, Y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
print(regr)

# the same fit with statsmodels; add_constant supplies the intercept
model = sm.OLS(Y, sm.add_constant(X))
results = model.fit()
print(results.summary())
sns.set(style="darkgrid")

# Generate a mask for the upper triangle
mask = np.zeros_like(alCorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(9, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(alCorr, mask=mask, annot=False, cmap=cmap, ax=ax)
f.tight_layout()
#plt.matshow(alCorr)

# https://github.com/statsmodels/statsmodels/issues/5343
# !pip install --upgrade patsy

import statsmodels.formula.api as sm

result = sm.ols(formula="PO4 ~ oPO4", data=algae).fit()  # ols: ordinary least squares
#dir(result)
#[(name, type(getattr(result, name))) for name in dir(result)]
[name for name in dir(result) if not callable(getattr(result, name))]  # non-callable attributes

print(result.params)
print(result.summary())
type(result.params)
type(algae)
# Now I will apply logistic regression, with ``outcome`` as the dependent variable;
# first, a quick linear look at ``BloodPressure`` against ``BMI``.

# In[10]:

plt.scatter(df["BloodPressure"], df["BMI"])
plt.show()

# In[11]:

import statsmodels.formula.api as sm

model = sm.ols(formula='BloodPressure ~ BMI', data=df).fit()
model.summary()

# # Logistic Regression
# Now I will take ``outcome`` as the dependent variable and run the test to study
# whether the other variables affect the ``outcome``.

# In[12]:

from sklearn import preprocessing
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
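# A minimal sketch of the logistic fit these imports set up, assuming the dependent
# variable column is named 'Outcome' (adjust the name to match df):
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
print('Test accuracy:', logreg.score(X_test, y_test))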
# In[6]:

"""top 10 in field goal percentage among power forwards with more than 20 games played"""
data_FGpct = data.loc[data['GamePlayed'] > 20]
data_FGpct = data_FGpct.loc[data_FGpct['Position'] == 'PF']
data_FGpct = data_FGpct.nlargest(10, 'FGPct')
data_FGpct = data_FGpct.sort_values(by=['FGPct'], ascending=False)
data_FGpct.loc[:, ['Player', 'Position', 'GamePlayed', 'Team', 'FGPct']]

# In[8]:

"""Regression for Points as dependent variable and the following as independent
variables: eFGPct, ThreePA, Assist, Turnover, TwoPct, Position as a categorical"""
reg = sm.ols("Points ~ eFGPct + ThreePA + Assist + Turnover + TwoPct + C(Position)", data=data).fit()
print(reg.summary())

# In[9]:

"""Regression for Points as dependent variable and the following as independent
variables: eFGPct, ThreePA, Assist, ORB, Position as a categorical"""
reg = sm.ols("Points ~ eFGPct + ThreePA + Assist + ORB + C(Position)", data=data).fit()
print(reg.summary())

# In[10]: