def var_inflation(self, df_raw, label_col, thresh=5, transform=False):
    '''
    Perform variance inflation factor (VIF) analysis on a given pandas
    DataFrame. For more information, see statsmodels'
    `variance_inflation_factor`.

    args:
        df_raw: pandas DataFrame (raw df without scaling or standardization)
        label_col: name of the label column to exclude from the analysis
        thresh: threshold used to drop columns
            thresh <= 1    : not correlated
            1 < thresh < 5 : moderately correlated
            thresh > 5     : highly correlated
        transform: (optional) whether to drop columns based on the threshold

    out:
        score: pandas DataFrame showing the variance inflation factor of
            each column
        transformed DataFrame with the high-VIF columns dropped (only when
            transform=True)
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
    df = df_raw.drop(label_col, axis=1)
    assert isinstance(df, pd.DataFrame), "'df' should be a pandas DataFrame"
    vif = pd.DataFrame(index=df.columns)
    vif["VIF Factor"] = [VIF(df.values, i) for i in range(df.shape[1])]
    if transform:
        # keep only the columns at or below the threshold
        return (vif, df.loc[:, (vif['VIF Factor'] <= thresh).values])
    else:
        return vif
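# A minimal usage sketch for the method above, on synthetic data. The wrapping
# class is not shown in this snippet; since the method never touches `self`,
# it is called here as a plain function with `self=None` for illustration.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({'a': rng.normal(size=100)})
demo['b'] = demo['a'] * 2 + rng.normal(scale=0.1, size=100)  # nearly collinear with 'a'
demo['c'] = rng.normal(size=100)
demo['target'] = rng.integers(0, 2, size=100)

scores, reduced = var_inflation(None, demo, 'target', thresh=5, transform=True)
print(scores)           # 'a' and 'b' should show large VIFs
print(reduced.columns)  # only the low-VIF columns remain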
def Validator(X, y, ypred):
    resids = ypred - y

    # Durbin-Watson: tests for autocorrelation in the residuals
    from statsmodels.stats.stattools import durbin_watson
    print(f"Durbin Watson Score (around 2 is good): {round(durbin_watson(resids, axis=0), 4)}")

    # RMSLE
    from sklearn.metrics import mean_squared_log_error
    print(f'Root Mean Squared Log Error is: {np.sqrt(mean_squared_log_error(y, ypred))}')

    # QQ-plot of the residuals: checks whether they are normally distributed
    import statsmodels.api as sm
    sm.qqplot(resids, line='r')
    plt.title("Q-Q plot of residuals (points on the line suggest normality)")
    plt.show()

    # VIF
    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
    vifs = [VIF(X.values, i) for i, colname in enumerate(X)]
    s = pd.Series(vifs, index=X.columns)
    s.plot.bar()
    plt.title(f"VIF Analysis. Testing linear dependencies (<5 is good)\n"
              f"the residual sum is (0 is good): {round(resids.sum(), 2)}")
    plt.show()

    # correlation heatmap
    # sns.heatmap(X.corr())
    # plt.show()

    # Y vs Ypred
    # plt.scatter(x=y, y=ypred)
    # plt.plot(y, y, color='red')
    # plt.title("Y vs Ypred (following the line is good)")
    # plt.show()

    # plot residuals
    plt.hist(resids, bins=20)
    plt.title("Residuals in 20 bins (should be normally distributed)")
    plt.show()
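# A minimal usage sketch for Validator, assuming the module-level imports the
# function body relies on (numpy as np, pandas as pd, matplotlib.pyplot as plt).
# The LinearRegression model and synthetic data are illustrative choices,
# not part of the original snippet.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
X = pd.DataFrame({'x1': rng.uniform(0, 10, 200), 'x2': rng.uniform(0, 10, 200)})
y = 2 * X['x1'] + 0.5 * X['x2'] + rng.normal(scale=1.0, size=200) + 5  # keep y positive for RMSLE

model = LinearRegression().fit(X, y)
ypred = pd.Series(model.predict(X), index=y.index)
Validator(X, y, ypred)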
# Before modeling, split the dataset into training and test sets.
# First, separate the independent and dependent variables.
final_vars = df_final.columns.values.tolist()
var_delete = ['Loan_Status_new', 'Loan_ID']
X = [i for i in final_vars if i not in var_delete]
df_final_X = df_final[X]
df_final_y = df_final['Loan_Status_new']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_final_X, df_final_y, test_size=0.3, random_state=0)

# Check for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

# Compute the VIF of each feature
vif = pd.DataFrame(columns=['feature', 'vif'])  # VIF result frame
vif['feature'] = X_train.columns                # feature names
vif['vif'] = [VIF(X_train.values, i) for i in range(X_train.shape[1])]  # VIF values
print(vif)

# Drop the variables with excessively high VIF
del X_train['Loan_Amount_Term']
del X_train['LoanAmount_log']
del X_test['Loan_Amount_Term']
del X_test['LoanAmount_log']

# 4. Model building
# Use scikit-learn's logistic regression
from sklearn import linear_model
from sklearn import metrics
clf = linear_model.LogisticRegression()
clf.fit(X_train, Y_train)

# 5. Model evaluation
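# The original snippet cuts off at the evaluation step. A minimal sketch of
# what it might look like, using the `clf`, `metrics`, `X_test`, and `Y_test`
# names defined above; the specific metrics are illustrative assumptions.
Y_pred = clf.predict(X_test)
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('Confusion matrix:')
print(metrics.confusion_matrix(Y_test, Y_pred))
print(metrics.classification_report(Y_test, Y_pred))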
def Remove_Highly_Collinear_Variables(Pandas_Design_Matrix,
                                      VIF_Threshold=5,
                                      Display_Indices=True,
                                      Return_Scores=False,
                                      Verbose=False):
    """
    Removes highly collinear variables from a design matrix above a certain
    VIF threshold.
    =========================================
    Pandas_Design_Matrix = pandas DataFrame of shape (N, P)
    VIF_Threshold = int/float threshold above which a variable is removed
    Display_Indices = Boolean: if True, show the names of dropped variables
    Return_Scores = Boolean: if True, also return the VIF scores
    Verbose = Boolean: if True, display the variable name and iteration as
              each variable is dropped
    =========================================
    """
    VIF_scores = []    # all VIF scores above the threshold
    iteration = 0      # counter telling us which iteration we are on
    column_names = []  # names of dropped columns
    PDM_copy = Pandas_Design_Matrix.copy()  # don't overwrite the old DataFrame, just in case
    GO = True  # prepare the while loop

    # Recompute the VIF score of each variable until a full pass over the
    # remaining columns finds nothing above the threshold
    while GO:
        iteration += 1
        STOP = True  # ensures termination even if every column has been dropped
        # Iterate over the current set of columns
        for i in range(PDM_copy.shape[1]):
            VIF_score = VIF(PDM_copy.values, i)
            # If the VIF score is above the threshold, drop that variable
            # from the matrix and start over on the reduced matrix
            if VIF_score > VIF_Threshold:
                column_name = PDM_copy.columns[i]
                column_names.append(column_name)
                VIF_scores.append(VIF_score)
                PDM_copy = PDM_copy.drop(column_name, axis=1)
                if Verbose:
                    print("iteration", str(iteration) + ":",
                          "Found high VIF variable with name:", column_name)
                # Restart looking for high VIFs on the new matrix
                STOP = False
                break
            # Prepare the while loop to stop if the for loop reaches the end
            else:
                STOP = True
        # End the while loop once we have finished looking for high VIFs
        if STOP:
            GO = False

    # Say how many variables were dropped
    print("\n", "Number of Dropped Variables:", len(VIF_scores), "\n")

    # If requested, display the high VIF scores and/or the dropped columns
    if Return_Scores and Display_Indices:
        print("VIF Scores above threshold:", VIF_scores, "\n")
        print("Dropped Columns list:", column_names)
        return PDM_copy, VIF_scores, column_names
    elif Display_Indices:
        print("Dropped Columns list:", column_names)
        return PDM_copy, column_names
    elif Return_Scores:
        print("VIF Scores above threshold:", VIF_scores, "\n")
        return PDM_copy, VIF_scores
    else:
        return PDM_copy
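# A minimal usage sketch for the function above, on illustrative synthetic
# data. The VIF import mirrors the alias the function body expects.
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

rng = np.random.default_rng(2)
dm = pd.DataFrame(rng.normal(size=(200, 3)), columns=['x1', 'x2', 'x3'])
dm['x4'] = dm['x1'] + dm['x2'] + rng.normal(scale=0.05, size=200)  # near-linear combination

reduced, dropped = Remove_Highly_Collinear_Variables(dm, VIF_Threshold=5)
print(reduced.columns.tolist(), dropped)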
def calculate_vif(x):
    """Return the VIF of each column of DataFrame `x` as a pandas Series."""
    return pd.Series([VIF(x.values, i) for i in range(x.shape[1])],
                     index=x.columns)
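# A minimal sketch of how such a helper is often used: repeatedly drop the
# feature with the largest VIF until all remaining VIFs sit below a cutoff.
# The loop and the cutoff of 5 are assumptions, not part of the original helper.
def drop_high_vif(x, cutoff=5):
    x = x.copy()
    while True:
        vifs = calculate_vif(x)
        if vifs.max() <= cutoff or x.shape[1] <= 1:
            return x
        x = x.drop(columns=vifs.idxmax())  # drop the worst offender and re-check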
""" Variance Inflation Factor (VIF) measures multicolinearity VIF is greater than 5 means high multicolinearity """ import pandas as pd from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF from matplotlib import pyplot as plt import seaborn as sns df = pd.read_csv('Data/train.csv', index_col=0, parse_dates=True) del df['atemp'] del df['humidity'] df = df.iloc[:, :-3] vifs = [VIF(df.values, i) for i, colname in enumerate(df)] s = pd.Series(vifs, index=df.columns) s.plot.bar() plt.show() sns.heatmap(df.corr()) plt.show()
for i in num_columns:
    standardize_num_cols(df[i])

scaler = MinMaxScaler()
df.Age = scaler.fit_transform(df.Age.values.reshape(-1, 1))
df.Balance = scaler.fit_transform(df.Balance.values.reshape(-1, 1))
df.EstimatedSalary = scaler.fit_transform(df.EstimatedSalary.values.reshape(-1, 1))

# Make Dummies
df = pd.get_dummies(data=df, columns=cat_columns, drop_first=True)

# VIF and Correlation Matrix
df_vif = df.drop(['Churn'], axis=1)
pd.Series([VIF(df_vif.values, i) for i in range(df_vif.shape[1])],
          index=df_vif.columns).sort_values(ascending=False)

# VIF Correlation Visualization
correlation = df.corr()
# tick labels
matrix_cols = correlation.columns.tolist()
# convert to array
corr_array = np.array(correlation)
# Plotting
trace = go.Heatmap(z=corr_array,
                   x=matrix_cols,
                   y=matrix_cols,
                   colorscale="Viridis",
                   colorbar=dict(title="Pearson Correlation coefficient",
# Normal distribution of residuals?
plt.hist(residuals, bins=20)

# Q-Q plot: are the residuals normally distributed?
import statsmodels.api as sm
pl = sm.qqplot(residuals, line='r')

# Are features linearly independent?
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
vifs = [VIF(df_train.values, i) for i, colname in enumerate(df_train)]
s = pd.Series(vifs, index=df_train.columns)
s.plot.bar()

##########################
# Kaggle test set
kaggle_test = pd.read_csv('test.csv', parse_dates=True, index_col=0)

def feature_engineering_test(df):
    # drop columns that the test data does not have
    if 'casual' in df.columns and 'registered' in df.columns:
        df.drop(['casual', 'registered'], axis=1, inplace=True)
    else:
def multicollinearity(self):
    """
    Multicollinearity: Assumes that predictors are not correlated with each
    other. If there is correlation among the predictors, then either remove
    predictors with high Variance Inflation Factor (VIF) values or perform
    dimensionality reduction.

    This assumption being violated causes issues with the interpretability
    of the coefficients and the standard errors of the coefficients.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
    import matplotlib.pyplot as plt
    import seaborn as sns
    from pandas.core.frame import DataFrame
    sns.set()

    if type(self.model) == str:
        self.fit_model()

    print('\n=======================================================================================')
    print('Assumption 2: Little to no multicollinearity among predictors')

    # Plotting the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(DataFrame(self.X, columns=self.features).corr(),
                annot=len(self.features) < 10,
                center=0,
                cmap=sns.diverging_palette(220, 20, as_cmap=True))
    plt.title('Correlation of Variables')
    plt.show()

    print('Variance Inflation Factors (VIF)')
    print('> 10: An indication that multicollinearity may be present')
    print('> 100: Certain multicollinearity among the variables')
    print('-------------------------------------')

    # Gathering the VIF for each variable
    vifs = {i: VIF(self.X, idx) for idx, i in enumerate(self.features)}
    vifs = dict(sorted(vifs.items(), key=lambda x: x[1], reverse=True))
    for key, vif in vifs.items():
        print(f'{key}: {vif}')

    # Gathering and printing total cases of possible or definite multicollinearity
    possible_multicollinearity = sum([1 for vif in vifs.values() if vif > 10])
    definite_multicollinearity = sum([1 for vif in vifs.values() if vif > 100])
    print()
    print(f'{possible_multicollinearity} cases of possible multicollinearity')
    print(f'{definite_multicollinearity} cases of definite multicollinearity')
    print()

    if definite_multicollinearity == 0:
        if possible_multicollinearity == 0:
            print('Assumption satisfied')
            self.results['Satisfied'].append('Multicollinearity')
        else:
            print('Assumption possibly satisfied')
            print()
            print('Coefficient interpretability may be problematic')
            print('Consider removing variables with a high Variance Inflation Factor (VIF)')
            self.results['Potentially'].append('Multicollinearity')
    else:
        print('Assumption not satisfied')
        print()
        print('Coefficient interpretability will be problematic')
        print('Consider removing variables with a high Variance Inflation Factor (VIF)')
        self.results['Violated'].append('Multicollinearity')
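# For reference, statsmodels computes the VIF of feature i as 1 / (1 - R_i^2),
# where R_i^2 is the (uncentered, since no constant is added) R-squared from
# regressing feature i on the remaining features. A small sketch verifying
# this on illustrative synthetic data:
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(3)
X = rng.normal(size=(500, 3))
X[:, 2] = X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.5, size=500)

i = 2
others = np.delete(X, i, axis=1)
beta, *_ = np.linalg.lstsq(others, X[:, i], rcond=None)
resid = X[:, i] - others @ beta
r2 = 1 - (resid @ resid) / (X[:, i] @ X[:, i])  # uncentered R^2

print(1 / (1 - r2))                     # manual VIF
print(variance_inflation_factor(X, i))  # statsmodels VIF, same value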
})
feature_imp.sort_values(by='importance', ascending=False).reset_index(drop=True)

#########################
#                       #
#  Multi-Collinearity   #
#                       #
#########################

# Check VIF score of Independent Numerical Variables (before removal)
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.tools.tools import add_constant

df_numeric = add_constant(train[['temp', 'atemp', 'humidity', 'windspeed']])
vif_scores = pd.Series(
    [VIF(df_numeric.values, i) for i in range(df_numeric.shape[1])],
    index=df_numeric.columns)  # note: naming the result `VIF` would shadow the function
print(vif_scores)

# Check VIF score of Independent Numerical Variables (after removal)
df_numeric = add_constant(train[['temp', 'humidity', 'windspeed']])
vif_scores = pd.Series(
    [VIF(df_numeric.values, i) for i in range(df_numeric.shape[1])],
    index=df_numeric.columns)
print(vif_scores)

############################
#                          #
#  one hot encoding for    #
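# Why add_constant above: statsmodels' variance_inflation_factor does not add
# an intercept itself, so without a constant column the auxiliary regressions
# are forced through the origin and the VIFs of uncentered features come out
# inflated. A small illustrative comparison (synthetic data, not the train set):
import numpy as np
import pandas as pd
from statsmodels.tools.tools import add_constant

rng = np.random.default_rng(4)
demo = pd.DataFrame({'u': rng.normal(10, 1, 300), 'v': rng.normal(5, 1, 300)})

without_const = [VIF(demo.values, i) for i in range(demo.shape[1])]
with_const = [VIF(add_constant(demo).values, i) for i in range(add_constant(demo).shape[1])]
print(without_const)   # large VIFs: an artifact of the missing intercept
print(with_const[1:])  # near 1, as expected for independent features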
var = df.groupby(['OverTime', 'Attrition']).Attrition.count()
var.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'], grid=False, figsize=(10, 10))
plt.show()

var = df.groupby(['Age_bins', 'Attrition']).Attrition.count()
var.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'], grid=False, figsize=(10, 10))
plt.show()

var = df.groupby(['WorkLifeBalance', 'Attrition']).Attrition.count()
var.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'], grid=False, figsize=(10, 10))
plt.show()

# Test VIF for multicollinearity
X = df.drop(['Attrition', 'EmployeeNumber', 'YearsAtCompany', 'PercentSalaryHike',
             'TotalWorkingYears', 'JobInvolvement', 'WorkLifeBalance',
             'Department_Research & Development'], axis=1)
pd.Series([VIF(X.values, i) for i in range(X.shape[1])],
          index=X.columns).sort_values(ascending=False)

# Drop columns for Correlation Matrix
data_model = df.drop(['Attrition', 'EmployeeNumber', 'YearsAtCompany', 'PercentSalaryHike',
                      'TotalWorkingYears', 'JobInvolvement', 'WorkLifeBalance',
                      'Department_Research & Development'], axis=1)

# Correlation matrix for variables
correlation = data_model.corr()
matrix_cols = correlation.columns.tolist()
corr_array = np.array(correlation)
trace = go.Heatmap(z=corr_array,
                   x=matrix_cols,
                   y=matrix_cols,
                   colorscale="Viridis",
                   colorbar=dict(title="Pearson Correlation coefficient",
                                 titleside="right"
if 'Binary' in str(type(lm)):
    print('\nAIC: ' + str(round(lm.aic, 3)))
    print('\nObserved/Predicted:')
    pt = lm.pred_table()
    print(pt)
    print('acc: ' + str(round((pt[1, 1] + pt[0, 0]) / pt.sum(), 3)))
    print('sens: ' + str(round(pt[1, 1] / (pt[1, 1] + pt[1, 0]), 3)))
    print('spec: ' + str(round(pt[0, 0] / (pt[0, 0] + pt[0, 1]), 3)))
else:
    print('\nRMSE: ' + str(round(lm.mse_resid**.5, 3)))  # OLS

print('\nVIFs:')
print(lm.params.index)
variables = lm.model.exog
print([round(VIF(variables, i), 3) for i in range(variables.shape[1])])

### Plot Regression Output
plot_regression(df_combo, lm)

#%% Fish detection rate (eDNA vs. Fish)
print('\nFish Stats on Sampling Days')
temp = df_combo[[
    'fish_count', 'coho_N_fish', 'coho_N_adult', 'coho_N_juvenile',
    'coho_biomass', 'coho_fish_present', 'trout_N_fish', 'trout_N_adult',
    'trout_N_juvenile', 'trout_biomass', 'trout_fish_present'
]]
temp = temp.dropna()
print('# trap days: ' + str(temp['fish_count'].sum()))
print('\nSums:')
print(temp.sum())