def var_inflation(self, df_raw, label_col, thresh=5, transform=False):
    '''
    Perform a variance inflation factor (VIF) analysis on a given Pandas DataFrame.
    For more information, search for 'variance_inflation_factor'.
    args:
        df_raw: pandas DataFrame (raw, without scaling or standardization)
        label_col: name of the label column, which is excluded from the analysis
        thresh: VIF threshold used to drop columns
                VIF <= 1    : not correlated
                1 < VIF < 5 : moderately correlated
                VIF > 5     : highly correlated
        transform: (optional) whether to also return the DataFrame with the
                   columns above the threshold dropped

    out:
        vif: pandas DataFrame showing the Variance Inflation Factor of each column
        transformed DataFrame with the high-VIF columns dropped (if transform=True)
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

    assert isinstance(df_raw, pd.DataFrame), "'df_raw' should be a Pandas DataFrame"
    df = df_raw.drop(label_col, axis=1)

    vif = pd.DataFrame(index=df.columns)
    vif["VIF Factor"] = [VIF(df.values, i) for i in range(df.shape[1])]
    if transform:
        # Keep only the columns whose VIF is at or below the threshold
        return (vif, df.loc[:, (vif['VIF Factor'] <= thresh).values])
    else:
        return vif
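
A minimal usage sketch, assuming the method lives on a class instance (here called analyzer, a hypothetical name) and that 'Price' is the label column of a DataFrame df:

# Hypothetical usage of var_inflation as defined above.
vif_scores, df_reduced = analyzer.var_inflation(df, label_col='Price',
                                                thresh=5, transform=True)
print(vif_scores.sort_values('VIF Factor', ascending=False))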
Example 2
def Validator(X, y, ypred):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    resids = ypred - y

    # Durbin-Watson: checks for autocorrelation in the residuals
    from statsmodels.stats.stattools import durbin_watson
    print(
        f"Durbin Watson Score (around 2 is good): {round(durbin_watson(resids, axis=0), 4)}"
    )

    # RMSLE (requires non-negative targets and predictions)
    from sklearn.metrics import mean_squared_log_error
    print(
        f'Root Mean Squared Log Error is: {np.sqrt(mean_squared_log_error(y, ypred))}'
    )

    # QQ-plot of the residuals (checks whether they are normally distributed)
    import statsmodels.api as sm
    sm.qqplot(resids, line='r')
    plt.title("Residual quantiles vs. normal quantiles (points should follow the line)")
    plt.show()

    # VIF: checks for linear dependence among the features
    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
    vifs = [VIF(X.values, i) for i in range(X.shape[1])]
    s = pd.Series(vifs, index=X.columns)
    s.plot.bar()
    plt.title(
        f"VIF Analysis. Testing linear dependencies (<5 is good)\nsum of residuals (0 is good): {round(resids.sum(), 2)}"
    )
    plt.show()

    #correlation heatmap
    #    sns.heatmap(X.corr())
    #    plt.show()

    # Y vs Ypred
    #    plt.scatter(x=y, y=ypred)
    #    plt.plot(y, y, color='red')
    #    plt.title("Y vs Ypred (following the line is good)")
    #    plt.show()

    #plot residuals
    plt.hist(resids, bins=20)
    plt.title(f"Residuals in 20 bins (should be normally distributed)")
    plt.show()
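
A possible way to call this diagnostic, as a sketch: X is assumed to be a feature DataFrame and y the matching target Series; the choice of regressor is purely illustrative.

# Fit any regressor, then inspect its residual diagnostics.
import pandas as pd
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)
Validator(X, y, pd.Series(model.predict(X), index=y.index))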
Example 3
# Before modeling, the dataset needs to be split into training and test sets
# First, determine the independent and dependent variables
final_vars = df_final.columns.values.tolist()
var_delete = ['Loan_Status_new', 'Loan_ID']
X = [i for i in final_vars if i not in var_delete]
df_final_X = df_final[X]
df_final_y = df_final['Loan_Status_new']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_final_X, df_final_y, test_size=0.3, random_state=0)
# Check for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
# Select the variables
vif = pd.DataFrame(columns=['feature', 'vif'])  # DataFrame to hold the VIF results
vif['feature'] = X_train.columns  # feature names
vif['vif'] = [VIF(X_train.values, i) for i in range(X_train.shape[1])]  # compute the VIF values
print(vif)
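
# Optional sketch: instead of hard-coding the drop list below, the candidates
# can be read off the vif table just built (the threshold of 10 here is an
# assumption, not part of the original example).
high_vif = vif.loc[vif['vif'] > 10, 'feature'].tolist()
print('High-VIF candidates:', high_vif)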

# Drop the variables with excessively high VIF
del X_train['Loan_Amount_Term']
del X_train['LoanAmount_log']
del X_test['Loan_Amount_Term']
del X_test['LoanAmount_log']
# 4. Model building
# Use the logistic regression implementation from sklearn
from sklearn import linear_model
from sklearn import metrics
clf = linear_model.LogisticRegression()
clf.fit(X_train, Y_train)

# 5. Model evaluation
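# The evaluation step is cut off in this listing; a minimal sketch of what it
# could look like, using the metrics module already imported above.
Y_pred = clf.predict(X_test)
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))
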
Example 4

def Remove_Highly_Collinear_Variables(Pandas_Design_Matrix,
                                      VIF_Threshold=5,
                                      Display_Indicies=True,
                                      Return_Scores=False,
                                      Verbose=False):
    """
    Removes Highly Collinear variables from a design matrix above a certain covariance threshold

    =========================================
    Design Matrix = Numpy array dim (X,P)
    VIF_Threshold = int/float determining threshold at which variable is removed
    Display_Indicies = Boolean:
                    If True show variable indicies above threshold
    Return_Scores = Boolean:
                    If True then also return the VIF Scores
    Verbose = Boolean:
                    If True then display indicie and iteration in dropping of variables as they happen
    =========================================

    """

    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

    VIF_scores = []  # List that collects all the high VIF scores
    iteration = 0  # Counter to tell us which iteration we are on
    column_names = []  # List of dropped column names

    PDM_copy = Pandas_Design_Matrix.copy()  # Don't overwrite the old dataframe, just in case

    GO = True  # Prepare the while loop
    STOP = True  # Safe default in case there are no columns left to scan
    # Recompute the VIF of each variable until a full pass finds nothing to drop
    while GO:
        iteration += 1

        # Iterate over the current set of columns
        for i in range(PDM_copy.shape[1]):
            VIF_score = VIF(PDM_copy.values, i)

            # If the VIF score is above the threshold, drop that variable and rescan
            if VIF_score > VIF_Threshold:
                column_name = PDM_copy.columns[i]
                column_names.append(column_name)
                VIF_scores.append(VIF_score)
                PDM_copy = PDM_copy.drop(column_name, axis=1)

                # If verbose, say which variable we dropped
                if Verbose:
                    print("iteration",
                          str(iteration) + ":",
                          "Found high VIF variable with name:", column_name)

                # Restart looking for high VIFs on the new matrix
                STOP = False
                break

            # Prepare the while loop to stop if the for loop gets to the end
            else:
                STOP = True

        # End the while loop once a full pass found no more high VIFs
        if STOP:
            GO = False

    # Say how many variables were dropped
    print("\n", "Number of Dropped Variables:", len(VIF_scores), "\n")

    # If requested, display the high VIF scores and/or the dropped column names

    if Return_Scores and Display_Indicies:
        print("VIF Scores above threshold:", VIF_scores, "\n")
        print("Dropped Columns list:", column_names)
        return PDM_copy, VIF_scores, column_names

    elif Display_Indicies:
        print("Dropped Columns list:", column_names)
        return PDM_copy, column_names

    elif Return_Scores:
        print("VIF Scores above threshold:", VIF_scores, "\n")
        return PDM_copy, VIF_scores

    else:
        return PDM_copy
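
A hypothetical call, assuming X is a feature DataFrame (the parameter values are just illustrative):

# Iteratively prune collinear features until every remaining VIF is <= 5.
X_reduced, dropped = Remove_Highly_Collinear_Variables(X, VIF_Threshold=5,
                                                       Display_Indicies=True)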
Example 5
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF


def calculate_vif(x):
    """Return the VIF of each column of the DataFrame x as a pandas Series."""
    return pd.Series([VIF(x.values, i) for i in range(x.shape[1])],
                     index=x.columns)
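
A quick usage sketch, assuming df is a DataFrame with numeric columns only:

# VIFs sorted from most to least collinear.
print(calculate_vif(df).sort_values(ascending=False))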
Example 6
"""
Variance Inflation Factor (VIF)

measures multicolinearity

VIF is greater than 5 means high multicolinearity
"""
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv('Data/train.csv', index_col=0, parse_dates=True)
del df['atemp']
del df['humidity']
df = df.iloc[:, :-3]

vifs = [VIF(df.values, i) for i in range(df.shape[1])]
s = pd.Series(vifs, index=df.columns)
s.plot.bar()
plt.show()
sns.heatmap(df.corr())
plt.show()

Example 7
for i in num_columns:
    standardize_num_cols(df[i])

scaler = MinMaxScaler()
df.Age = scaler.fit_transform(df.Age.values.reshape(-1, 1))
df.Balance = scaler.fit_transform(df.Balance.values.reshape(-1, 1))
df.EstimatedSalary = scaler.fit_transform(df.EstimatedSalary.values.reshape(-1, 1))

# Make Dummies

df = pd.get_dummies(data=df, columns=cat_columns, drop_first=True)

# VIF And Correlation Matrix

df_vif = df.drop(['Churn'], axis=1)
pd.Series([VIF(df_vif.values, i) for i in range(df_vif.shape[1])],
          index=df_vif.columns).sort_values(ascending=False)

# VIF Correlation Visualization

correlation = df.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"))
Example 8

# Normal distribution of residuals?

plt.hist(residuals, bins=20)

# Change in variance - homoscedasticity / heteroscedasticity

import statsmodels.api as sm

pl = sm.qqplot(residuals, line='r')

# Are features linearly independent?

from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

vifs = [VIF(df_train.values, i) for i in range(df_train.shape[1])]
s = pd.Series(vifs, index=df_train.columns)
s.plot.bar()

##########################

# Kaggle test set

kaggle_test = pd.read_csv('test.csv', parse_dates=True, index_col=0)


def feature_engineering_test(df):
    # drop columns that the test data does not have
    if 'casual' in df.columns and 'registered' in df.columns:
        df.drop(['casual', 'registered'], axis=1, inplace=True)
    else:
Example 9
    def multicollinearity(self):
        """
        Multicollinearity: Assumes that predictors are not correlated with each other. If there is
                           correlation among the predictors, then either remove prepdictors with high
                           Variance Inflation Factor (VIF) values or perform dimensionality reduction
                           This assumption being violated causes issues with interpretability of the 
                           coefficients and the standard errors of the coefficients.
        """
        from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
        import matplotlib.pyplot as plt
        import seaborn as sns
        from pandas.core.frame import DataFrame
        sns.set()

        if isinstance(self.model, str):
            self.fit_model()

        print(
            '\n======================================================================================='
        )
        print('Assumption 2: Little to no multicollinearity among predictors')
        # Plotting the heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(DataFrame(self.X, columns=self.features).corr(),
                    annot=len(self.features) < 10,
                    center=0,
                    cmap=sns.diverging_palette(220, 20, as_cmap=True))
        plt.title('Correlation of Variables')
        plt.show()
        print('Variance Inflation Factors (VIF)')
        print('> 10: An indication that multicollinearity may be present')
        print('> 100: Certain multicollinearity among the variables')
        print('-------------------------------------')
        # Gathering the VIF for each variable
        vifs = {i: VIF(self.X, idx) for idx, i in enumerate(self.features)}
        vifs = dict(sorted(vifs.items(), key=lambda x: x[1], reverse=True))
        for key, vif in vifs.items():
            print(f'{key}: {vif}')
        # Gathering and printing total cases of possible or definite multicollinearity
        possible_multicollinearity = sum(
            [1 for vif in vifs.values() if vif > 10])
        definite_multicollinearity = sum(
            [1 for vif in vifs.values() if vif > 100])
        print()
        print(
            f'{possible_multicollinearity} cases of possible multicollinearity'
        )
        print(
            f'{definite_multicollinearity} cases of definite multicollinearity'
        )
        print()
        if definite_multicollinearity == 0:
            if possible_multicollinearity == 0:
                print('Assumption satisfied')
                self.results['Satisfied'].append('Multicollinearity')
            else:
                print('Assumption possibly satisfied')
                print()
                print('Coefficient interpretability may be problematic')
                print(
                    'Consider removing variables with a high Variance Inflation Factor (VIF)'
                )
                self.results['Potentially'].append('Multicollinearity')

        else:
            print('Assumption not satisfied')
            print()
            print('Coefficient interpretability will be problematic')
            print(
                'Consider removing variables with a high Variance Inflation Factor (VIF)'
            )
            self.results['Violated'].append('Multicollinearity')
})
feature_imp.sort_values(by='importance',
                        ascending=False).reset_index(drop=True)

#########################
#                       #
#   Multi-Collinearity  #
#                       #
#########################

# Check VIF score of Independent Numerical Variables (before removal)
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.tools.tools import add_constant

# Note: the result Series is named 'vifs' so it does not shadow the imported VIF function
df_numeric = add_constant(train[['temp', 'atemp', 'humidity', 'windspeed']])
vifs = pd.Series(
    [VIF(df_numeric.values, i) for i in range(df_numeric.shape[1])],
    index=df_numeric.columns)
print(vifs)

# Check VIF score of Independent Numerical Variables (after removal)
df_numeric = add_constant(train[['temp', 'humidity', 'windspeed']])
vifs = pd.Series(
    [VIF(df_numeric.values, i) for i in range(df_numeric.shape[1])],
    index=df_numeric.columns)
print(vifs)
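
# add_constant matters above because statsmodels' variance_inflation_factor
# fits its auxiliary regressions without adding an intercept of its own.
# The 'const' row is not a meaningful VIF and can be dropped when reporting:
print(vifs.drop('const'))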

############################
#                          #
#  one hot encoding for    #
var = df.groupby(['OverTime', 'Attrition']).Attrition.count()
var.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'],
                   grid=False, figsize=(10, 10))
plt.show()

var = df.groupby(['Age_bins', 'Attrition']).Attrition.count()
var.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'],
                   grid=False, figsize=(10, 10))
plt.show()

var = df.groupby(['WorkLifeBalance', 'Attrition']).Attrition.count()
var.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'],
                   grid=False, figsize=(10, 10))
plt.show()
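
# The three plots above repeat one pattern; an equivalent loop form
# (same columns and styling, purely a compaction):
for col in ['OverTime', 'Age_bins', 'WorkLifeBalance']:
    counts = df.groupby([col, 'Attrition']).Attrition.count()
    counts.unstack().plot(kind='bar', stacked=True, color=['blue', 'orange'],
                          grid=False, figsize=(10, 10))
    plt.show()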

# Test VIF for multicollinearity
X = df.drop(['Attrition', 'EmployeeNumber', 'YearsAtCompany', 'PercentSalaryHike',
             'TotalWorkingYears', 'JobInvolvement', 'WorkLifeBalance',
             'Department_Research & Development'], axis=1)
pd.Series([VIF(X.values, i) for i in range(X.shape[1])],
          index=X.columns).sort_values(ascending=False)

# Drop columns for Correlation Matrix
data_model = df.drop(['Attrition', 'EmployeeNumber', 'YearsAtCompany', 'PercentSalaryHike',
                      'TotalWorkingYears', 'JobInvolvement', 'WorkLifeBalance',
                      'Department_Research & Development'], axis=1)


# Correlation matrix for variables
correlation = data_model.corr()
matrix_cols = correlation.columns.tolist()
corr_array  = np.array(correlation)
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"))
if 'Binary' in str(type(lm)):
    print('\nAIC: ' + str(round(lm.aic, 3)))
    print('\nObserved/Predicted:')
    pt = lm.pred_table()
    print(pt)
    print('acc: ' + str(round((pt[1, 1] + pt[0, 0]) / pt.sum(), 3)))
    print('sens: ' + str(round(pt[1, 1] / (pt[1, 1] + pt[1, 0]), 3)))
    print('spec: ' + str(round(pt[0, 0] / (pt[0, 0] + pt[0, 1]), 3)))

else:
    print('\nRMSE: ' + str(round(lm.mse_resid**.5, 3)))  # OLS
    print('\nVIFs:')
    print(lm.params.index)
    variables = lm.model.exog
    print([round(VIF(variables, i), 3) for i in range(variables.shape[1])])
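
# A hedged variant of the print above that pairs each VIF with its regressor
# name (exog_names is statsmodels' list of design-matrix column names; pandas
# is assumed to be imported as pd):
print(pd.Series([round(VIF(variables, i), 3) for i in range(variables.shape[1])],
                index=lm.model.exog_names))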

### Plot Regression Output
plot_regression(df_combo, lm)
#%% Fish detection rate (eDNA vs. Fish)

print('\nFish Stats on Sampling Days')
temp = df_combo[[
    'fish_count', 'coho_N_fish', 'coho_N_adult', 'coho_N_juvenile',
    'coho_biomass', 'coho_fish_present', 'trout_N_fish', 'trout_N_adult',
    'trout_N_juvenile', 'trout_biomass', 'trout_fish_present'
]]
temp = temp.dropna()
print('# trap days: ' + str(temp['fish_count'].sum()))
print('\nSums:')
print(temp.sum())