def make_ols_sklearn(X, y, test_size=0.20, fit_intercept=False, standardize=False):
    """Fit an ordinary least squares model with scikit-learn and print its scores.

    The data is split once with ``train_test_split``; the model is fitted on
    the training part and scored (R^2) on both parts.  Two cross-validated
    scores (5- and 10-fold, ``neg_mean_absolute_error``) are printed as well.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target values aligned with ``X``.
        test_size: Passed straight to ``train_test_split``.
        fit_intercept: Passed straight to ``LinearRegression``.
        standardize: When true, a ``StandardScaler`` is fitted on the training
            split and applied to both the training and the test split.

    Returns:
        The ``LinearRegression`` estimator fitted on the training split.

    Note:
        The cross-validation calls receive the raw ``X``/``y``, so they are
        not affected by ``standardize``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if standardize:
        # Learn the scaling from the training split only, so no information
        # from the test split leaks into the fit.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # `normalize` was removed from LinearRegression in scikit-learn 1.2 and
    # passing it raises TypeError there; False was its default, so omitting
    # it keeps the behaviour on older releases too.
    ols = LinearRegression(fit_intercept=fit_intercept)
    ols.fit(X_train, y_train)

    train_score = ols.score(X_train, y_train)
    test_score = ols.score(X_test, y_test)

    # cross_val_score clones the estimator, so the fitted `ols` is untouched.
    cvmae_5 = np.mean(cross_val_score(ols, X, y, cv=5, scoring='neg_mean_absolute_error'))
    cvmae_10 = np.mean(cross_val_score(ols, X, y, cv=10, scoring='neg_mean_absolute_error'))

    print(f"train R2 score = {train_score}")
    print(f"test R2 score = {test_score}\n")
    # These are negated mean absolute errors (closer to 0 is better), not MSE.
    print(f"cv5 neg MAE score = {cvmae_5}")
    print(f"cv10 neg MAE score = {cvmae_10}")

    return ols
def cleanRP_SG(self):
    """Fill every missing value in ``self.all`` using a per-column strategy.

    ``self.all`` is modified in place.  Columns without missing values and
    the ``SalePrice`` column are left untouched.  For each remaining column
    the first matching rule applies:

    * ``LotFrontage``: predicted from a linear regression of ``LotFrontage``
      on ``log(LotArea)`` fitted on the rows where ``LotFrontage`` is known.
    * ``GarageYrBlt``: set to the constant 1871.
    * columns in the "mode" group: filled with the column's most frequent value.
    * columns in the "mean" group: filled with the column mean.
    * columns in the "none or zero" group: filled with 0 when the column is
      numeric, otherwise with the string ``'None'``.
    * anything else: a message is printed and the column is left as is.
    """
    none_or_zero = {
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'Alley',
        'Fence', 'GarageType', 'GarageQual',
        'GarageCond', 'GarageFinish', 'GarageCars',
        'GarageArea', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'PoolQC',
        'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    }
    mode_columns = {
        'Electrical', 'Exterior1st', 'Exterior2nd', 'FireplaceQu', 'Functional',
        'KitchenQual', 'MSZoning', 'SaleType', 'Utilities',
    }
    mean_columns = {'TotalBsmtSF'}

    # Filter SalePrice out here instead of list.remove(), which raises
    # ValueError when SalePrice happens to have no missing values.
    columns_with_missing_data = [
        name for name in self.all.columns
        if name != 'SalePrice' and self.all[name].isnull().any()
    ]

    for column in columns_with_missing_data:
        col_data = self.all[column]

        if column == 'LotFrontage':
            missing = col_data.isnull()
            log_area = np.log(self.all['LotArea'])
            ols = linear_model.LinearRegression()
            # scikit-learn expects a 2-D feature array, hence reshape(-1, 1).
            ols.fit(
                log_area[~missing].values.reshape(-1, 1),
                col_data[~missing].values,
            )
            self.all.loc[missing, 'LotFrontage'] = ols.predict(
                log_area[missing].values.reshape(-1, 1)
            )
        elif column == 'GarageYrBlt':
            # NOTE(review): an earlier comment described this as imputing
            # YearBuilt, but the code has always written the constant 1871;
            # the constant is kept - confirm which one is intended.
            self.all.loc[col_data.isnull(), 'GarageYrBlt'] = 1871
        elif column in mode_columns:
            # mode() may return several values; take the first, computed once.
            self.all[column] = col_data.fillna(col_data.mode().iloc[0])
        elif column in mean_columns:
            # Assign the result back: an in-place fillna on a column selection
            # is not guaranteed to update the parent frame.
            self.all[column] = col_data.fillna(col_data.mean())
        elif column in none_or_zero:
            fill_value = 0 if pd.api.types.is_numeric_dtype(col_data) else 'None'
            self.all[column] = col_data.fillna(fill_value)
        else:
            print('Uh oh!!! No cleaning strategy for:' + column)
def regression(df):
    """Fit an OLS model on a random subset of ``df`` and return its summary.

    Each row is kept for fitting independently with probability 0.75.  The
    column ``'Number of total People injured and killed'`` is the response;
    every other column of ``df`` is used as a regressor.  No constant column
    is added here.

    Args:
        df: DataFrame containing the response column and the regressors.

    Returns:
        The summary object produced by the fitted statsmodels OLS result.
    """
    target = 'Number of total People injured and killed'

    msk = np.random.rand(len(df)) < 0.75
    train = df[msk]

    # `axis` must be passed by keyword: pandas 2.0 rejects the positional form.
    ols = sm.OLS(train[target], train.drop(target, axis=1))
    result = ols.fit()
    return result.summary()
def _regression(self):
    '''
    Generate a summary table of the parameters from the regression analysis.

    Each row of ``self._data`` is kept for fitting independently with
    probability 0.75.  ``'TOTAL FATALITIES'`` is the response and every other
    column is used as a regressor; no constant column is added here.

    Returns:
        The summary object produced by the fitted statsmodels OLS result.
    '''
    target = 'TOTAL FATALITIES'

    mask = np.random.rand(len(self._data)) < 0.75
    train = self._data[mask]

    # `axis` must be passed by keyword: pandas 2.0 rejects the positional form.
    ols = sm.OLS(train[target], train.drop(target, axis=1))
    result = ols.fit()
    return result.summary()
def regression(self):
    '''
    Print a summary table of the parameters from the regression analysis.

    Each row of ``self._data`` is kept for fitting independently with
    probability 0.75.  ``'Total Fatalities'`` is the response and every other
    column is used as a regressor; no constant column is added here.

    Returns:
        None.  The summary is written to standard output.
    '''
    target = 'Total Fatalities'
    df = self._data

    msk = np.random.rand(len(df)) < 0.75
    train = df[msk]

    # `axis=1` by keyword works on old and new pandas alike; the positional
    # form is rejected by pandas 2.0.
    ols = sm.OLS(train[target], train.drop(target, axis=1))
    result = ols.fit()

    # Parenthesised single-argument print behaves the same on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(result.summary())
def sg_simpleLM(self):
    """Fit a plain linear regression, report its errors and store the CV score.

    Steps, in order:
      1. call ``self.test_train_split()`` (presumably populates
         ``self.x_train``, ``self.y_train``, ``self.x_test`` and
         ``self.y_test`` - confirm against that method);
      2. fit ``LinearRegression`` on the training data converted to arrays;
      3. predict on ``self.x_test`` and pass the predictions to
         ``self.plot_results``;
      4. print the value of ``self.rmsle`` for the predictions and the value
         of ``self.rmse_cv`` for the model on the training data;
      5. store that same ``rmse_cv`` value in
         ``self.results_dict['simpleLM']``.
    """
    self.test_train_split()

    ols = linear_model.LinearRegression()
    model = ols.fit(np.asarray(self.x_train), np.asarray(self.y_train))

    sm_model_pred = model.predict(self.x_test)
    self.plot_results(sm_model_pred)

    print('RMSLE from Kaggle: ' + str(self.rmsle(y_pred=sm_model_pred, y_test=self.y_test)))

    # Evaluate the cross-validated score once so the printed value and the
    # stored value are the same number and the CV work is not repeated.
    cv_rmse = self.rmse_cv(model, self.x_train, self.y_train)
    print(cv_rmse)
    self.results_dict['simpleLM'] = cv_rmse
def make_ols(df, x_columns, target='price'):
    """Fit a statsmodels OLS regression, show its diagnostics and return it.

    The model summary is shown with ``display`` and a Q-Q plot of the
    residuals against a normal distribution is drawn, with both axes limited
    to [-2.5, 2.5].

    Args:
        df: DataFrame holding the predictor and target columns.
        x_columns: Column label(s) of ``df`` to use as predictors.  No
            constant column is added here.
        target: Column label of the response variable.

    Returns:
        The fitted statsmodels results object, for later use.
    """
    # Set the X and y variables.
    X = df[x_columns]
    y = df[target]

    # Fit the statsmodels OLS model.
    model = sm.OLS(y, X).fit()

    # Display the model summary.
    display(model.summary())

    # Plot the residuals against the normal distribution.
    sm.graphics.qqplot(model.resid, dist=stats.norm, line='r', alpha=.65, fit=True, markerfacecolor="y")
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)

    # Return the fitted model; the bare `return` used to hand back None.
    return model
# Standardize the features: the scaler is fitted on the training split and
# the same transform is then applied to the test split.
sc = StandardScaler()
X_train_S=sc.fit_transform(X_train)
X_test_S=sc.transform(X_test)
# Fit a linear regression on the standardized training data, predict on the
# standardized test data, and print the fitted intercept and coefficients.
model_lr= LinearRegression()
model_lr.fit(X_train_S, y_train)
y_pred_lr = model_lr.predict(X_test_S)
print(model_lr.intercept_)
print(model_lr.coef_)
# Fit the same data with statsmodels to obtain p-values and a summary table.
# NOTE(review): sm.OLS is given X_train_S as-is, with no constant column,
# while LinearRegression() above uses its default intercept setting - the two
# fits may therefore describe different models; confirm this is intended.
ols = sm.OLS(y_train, X_train_S)
lr=ols.fit()
pvalues = lr.pvalues
print(lr.summary())
# Second model: 'Premium' as the target with three selected predictors.
yF = fulldata_NMiss['Premium']
XF = fulldata_NMiss[['ins_Cancelation', 'Ticket', 'Accident']]
# 75/25 split with a fixed random_state so the split is reproducible.
XF_train, XF_test, yF_train, yF_test = train_test_split(XF, yF, test_size=0.25, random_state=100)
# `sc` and `model_lr` are rebound here, replacing the objects created above.
sc = StandardScaler()
XF_train_S=sc.fit_transform(XF_train)
XF_test_S=sc.transform(XF_test)
model_lr= LinearRegression()
model_lr.fit(XF_train_S, yF_train)