Example #1
import numpy as np
import pandas as pd


def multi_collinearity_test(data, flag=0):
    """
    When flag=0:
    Return the determinant of the correlation matrix of the data.
    The value describes the level of multicollinearity:
    the larger the value, the less likely multicollinearity is.
    * 0 = perfect collinearity, 1 = no collinearity
    When flag=1:
    Return the eigenvalues and eigenvectors of the correlation matrix;
    eigenvalues close to zero indicate collinearity.
    When flag=2:
    Return the variance inflation factor (VIF) for each column.
    It quantifies the severity of multicollinearity in an ordinary
    least squares regression analysis.
    A rule of thumb for interpreting the variance inflation factor:
    * 1 = not correlated.
    * Between 1 and 5 = moderately correlated.
    * Greater than 5 = highly correlated.
    """
    corr = np.corrcoef(data, rowvar=False)
    if flag == 0:
        res = np.linalg.det(corr)
    elif flag == 1:
        res = np.linalg.eig(corr)
    elif flag == 2:
        from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
        if isinstance(data, pd.DataFrame):
            res = {
                c: vif(data.values, data.columns.get_loc(c))
                for c in data.columns
            }
        else:
            res = {i: vif(data, i) for i in range(data.shape[1])}
    else:
        raise ValueError('flag {} is not defined.'.format(flag))
    return res
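# A minimal usage sketch of the function above on fabricated data (the column names and
# values here are mine, for illustration only):
rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
demo = pd.DataFrame({
    'x1': x1,
    'x2': 0.9 * x1 + rng.normal(scale=0.1, size=200),  # nearly collinear with x1
    'x3': rng.normal(size=200),
})
print(multi_collinearity_test(demo, flag=0))  # determinant close to 0 -> strong collinearity
print(multi_collinearity_test(demo, flag=2))  # per-column VIFs; x1 and x2 come out large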
Example #2
 def get_single_vif(group, RHS):
     dmatrix = patsy.dmatrix(formula_like=RHS, data=group)
     vifs = {
         name: vif(dmatrix, index)
         for name, index in dmatrix.design_info.column_name_indexes.items()
     }
     return pd.Series(vifs)
Example #3
 def compute_vif(self):
     """Compute variance inflation factors for all input variables."""
     vifs = dict()
     for ind, col in enumerate(self.X_train.columns):
         vif_score = vif(np.asarray(self.X_train), ind)  # np.matrix is deprecated; use a plain ndarray
         vifs[col] = round(vif_score, 2)
     return vifs
Example #4
 def vif(self):
     """Determine the Variance Inflation Factor (vif) of the coefficients and return a dataframe of the vif's."""
     vif_out = pd.DataFrame()
     predictors = np.array(self.predictors)
     vif_out["VIF Factor"] = [
         vif(predictors, i) for i in range(predictors.shape[1])
     ]
     vif_out["features"] = self.predictors.columns
     return vif_out
Example #5
def vif_test(X):
    vd_out = {}
    for i, n in enumerate(X.names):
        try:
            vd_out[n] = round(vif(X.array, i), 4)
        except Exception:  # a singular design column makes the VIF undefined
            vd_out[n] = 0.0

    return vd_out
Example #6
 def vif(self):
     '''Computes variance inflation factors for each feature variable'''
     import statsmodels.api as sm
     from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
     lm = sm.OLS(self.target_, sm.add_constant(self.features_)).fit()  # fitted model kept for reference
     for i in range(self.features_.shape[1]):
         v = vif(np.asarray(self.features_), i)  # np.matrix is deprecated; use a plain ndarray
         print("Variance inflation factor for feature {}: {}".format(
             i, round(v, 2)))
Example #7
def vif_coeffs(M):
	'''
	Description:
		Compute VIF on every column of the matrix M
	Input arguments:
		* M: 2D np.ndarray
	Return:
		* A list of VIF for each column. Size of list is equal to no. of columns.
	'''
	return [vif(M, idx) for idx in range(M.shape[1])]
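# A hypothetical quick check of vif_coeffs on fabricated data (assumes the usual import
# `from statsmodels.stats.outliers_influence import variance_inflation_factor as vif`
# is in scope for the function above):
import numpy as np

rng = np.random.default_rng(7)
M = rng.normal(size=(100, 3))
M[:, 2] = M[:, 0] + M[:, 1] + 0.01 * rng.normal(size=100)  # near-exact linear dependence
print(vif_coeffs(M))  # all three VIFs blow up: each column is nearly a combination of the others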
Example #8
 def _collinear_vif(self):
     """
     Check for collinear features
     """
     for ind in range(self.X.shape[1]):
         value = vif(self.X, ind)
         if value > self.vifMagnitude:
             print(self.header[ind] + ' has vif ' + str(value))
             self.collinear = True
     if self.collinear:
         raise Exception('Collinear feature risk')
Example #9
def get_vif():
    '''
    Calculates Variance Inflation Factor for each feature in a dataframe.
    :return: Pandas dataframe of VIF scores for each feature.
    '''
    df, categorical_mappings, config = read_data()
    y = df[config['outcome_feature']]
    X = df[[i for i in df.columns if i != config['outcome_feature']]]
    X = sm.add_constant(X)
    sm.OLS(y, X).fit().summary()  # summary computed for inspection; the result is not used here
    vif_scores = [vif(X.values, i) for i in range(X.shape[1])]
    return pd.concat([pd.Series(X.columns), pd.Series(vif_scores)], axis=1).rename(columns={0: 'column', 1: 'vif'})
Example #10
def cal_vif(df, vif_columns):
    """
    计算VIF
    """
    vif_df = df.loc[:, vif_columns].fillna(-999)
    columns = vif_df.columns.tolist()
    vif_ma = vif_df.as_matrix()
    result = {}
    for k, v in enumerate(columns):
        result[v] = vif(vif_ma, k)
    vif_result = pd.Series(result, name='vif')
    vif_result.index.name = 'variable'
    vif_result = vif_result.reset_index()
    return vif_result
Example #11
 def calculate(self, X):
     stop = False
     while not stop:
         columns = X.columns
         scores = np.array([vif(X[columns].values, columns.get_loc(col)) for col in columns])
         if scores.max() > self.thresh:
             max_col = columns[scores.argmax()]
             X = X.drop(max_col, axis=1)
         else:
             stop = True

     return columns
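# A self-contained restatement of the loop above as a free function (my own sketch;
# `prune_by_vif` and the default threshold of 5.0 are illustrative assumptions, not the
# source's API): repeatedly drop the column with the largest VIF until every remaining
# column's VIF is at or below the threshold.
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

def prune_by_vif(X, thresh=5.0):
    X = X.copy()
    while True:
        scores = np.array([vif(X.values, i) for i in range(X.shape[1])])
        if scores.max() <= thresh:
            return X.columns
        X = X.drop(columns=X.columns[scores.argmax()])

# Demo on fabricated data: 'b' nearly duplicates 'a', so one of the two is dropped.
rng = np.random.default_rng(2)
a = rng.normal(size=300)
demo = pd.DataFrame({'a': a,
                     'b': a + rng.normal(scale=0.01, size=300),
                     'c': rng.normal(size=300)})
print(list(prune_by_vif(demo)))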
Example #12
    def vif(self):
        """Computes variance influence factors for each feature variable"""
        if not self.is_fitted:
            print("Model not fitted yet!")
            return None
        import statsmodels.api as sm
        from statsmodels.stats.outliers_influence import (
            variance_inflation_factor as vif, )

        lm = sm.OLS(self.target_, sm.add_constant(self.features_)).fit()
        for i in range(self.features_.shape[1]):
            v = vif(np.asarray(self.features_), i)  # np.matrix is deprecated; use a plain ndarray
            print("Variance inflation factor for feature {}: {}".format(
                i, round(v, 2)))
Example #13
def get_vif(X):
    """
    Takes a pd.DataFrame or 2D np.array
    and prints the Variance Inflation Factor
    for every variable.
    """

    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    else:
        X = X.copy()  # avoid mutating the caller's frame when adding the intercept

    X['__INTERCEPT'] = np.ones(X.shape[0])

    for i in range(X.shape[1] - 1):
        the_vif = vif(X.values, i)
        print("VIF for column {:03}: {:.02f}".format(i, the_vif))
Example #14
def _collinear_vif(df, thresh=5.):
    """
    Check for collinear features
    """

    x = df.values
    dropped = set([])
    for i in range(x.shape[1]):
        ind = i - len(dropped)
        value = vif(x, ind)
        print(ind, value, x.shape)
        if value > thresh:
            dropped.add(df.columns[i])
            x = np.delete(x, ind, 1)
    return df[[i for i in df if i not in dropped]]
Example #15
    def compute(self, data, columns):
        """Checks the dataset for multicollinearity using the variance inflation factor.

        Args:
            data: your dataset
            columns: not in use; exists for structural consistency
        Returns:
            The dataset, unchanged, for structural consistency.
        """

        print("computing variance_inflation_factor")
        results = {}
        for i in range(data.shape[1]):
            results[data.columns[i]] = vif(data.values, i)

        print(sorted(results.items(), key=lambda item: item[1]))  # sort by VIF; avoids an extra operator import

        # returns dataset due to structural consistency
        return data
Example #16
def check_multicollinearity(df_exogs, add_constant=False):
    '''
    Evaluate variance inflation factors and the condition index.
    Returns a DataFrame of VIFs.
    '''
    if add_constant:
        df = df_exogs.copy()
        df['const'] = 1
    else:
        df = df_exogs
    vif_df = pd.DataFrame({
        'var': df.columns,
        'VIF': [vif(df.values, i) for i in range(df.shape[1])]
    })

    ci = np.linalg.cond(df.values)
    print(f'Condition index: {round(ci, 2)}')
    return vif_df
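# Hypothetical usage of check_multicollinearity on fabricated data (numpy/pandas assumed
# imported as np/pd, as in the function above):
rng = np.random.default_rng(4)
u = rng.normal(size=500)
exogs = pd.DataFrame({
    'u': u,
    'v': u + rng.normal(scale=0.05, size=500),  # nearly collinear with u
    'w': rng.normal(size=500),
})
print(check_multicollinearity(exogs, add_constant=True))
# Expect large VIFs for u and v, a VIF near 1 for w, and a large condition index.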
Example #17
def colineary_analysis(dim_vars, correlation_coef, path, new_indexes):
    #    ck=np.vstack((np.ones(dim_vars.T.shape[0]),dim_vars.T.transpose())).transpose()

    #    vif_results = [vif(ck, i) for i in range(ck.shape[1])]

    for cf in correlation_coef:

        independent = np.vstack((np.ones(
            dim_vars.T[new_indexes.get_local_inner_indices()].shape[0]),
                                 dim_vars.T[:, 0]))
        independent = np.vstack(
            (independent, dim_vars.T[new_indexes.get_local_inner_indices(),
                                     -1]))
        ind_ind = np.array([0, dim_vars.T.shape[1]])
        independent_t = independent.transpose()

        for i in range(1, dim_vars.T.shape[1] - 2):
            prueba = np.vstack(
                (independent_t.transpose(),
                 dim_vars.T[new_indexes.get_local_inner_indices(),
                            i])).transpose()
            vif_r = vif(prueba, prueba.shape[1] - 1)
            if vif_r > 1. / (1. - cf):
                continue
            else:
                independent_t = prueba
                ind_ind = np.append(ind_ind, i)
        ind_ind.sort()

        print('----------------------------------------')
        print('Analysis of collinearity')
        print('For a correlation coefficient %f' % cf)
        print('the non-collinear snapshots are')
        print(ind_ind)
        print('----------------------------------------')
        np.savetxt(
            path + 'independent_snapschots_correlation_0_' + str(cf) + '.dat',
            ind_ind)
Example #18
#As introduced earlier, adding more explanatory variables basically raises the accuracy of the
#multiple regression equation, but that is not always a good thing.
#When the fit is high yet the predictive accuracy is low, we call it overfitting.
#Overfitting is caused by building a model that is fitted too closely to the data at hand.
#When this happens, the equation fits the current validation data well, but the fit
#deteriorates when the regression equation is applied to new data we want to predict.
#To avoid overfitting, the cross-validation method introduced next is generally used:
#prepare one dataset for deriving the regression equation and another for checking how well it fits.
#The dataset used for this multiple regression analysis provides both
#'train.csv' for deriving the equation and 'test.csv' for checking the fit, so we use test.csv.

## Multicollinearity
#Multicollinearity means very strong correlation among the explanatory variables; when it is
#severe, the variance of the regression coefficients grows and the model's predictions are
#known to degrade.
#However, if the goal of the multiple regression is prediction rather than insight into causal
#relationships, you do not need to worry about it.
#In the summary() output, Cond. No. is the indicator for checking multicollinearity.
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

num_cols = model.exog.shape[1]  # number of explanatory variable columns
vifs = [vif(model.exog, i) for i in range(num_cols)]

pd.DataFrame(vifs, index=model.exog_names, columns=['VIF'])
#
#It is generally said that when a VIF exceeds 10 (5 in the official reference), the dependence
#is too strong for a proper multiple regression analysis.
#
#In this case, the VIF values of all the 'week' columns created by dummy encoding are 'inf',
#so their mutual dependence is extremely strong.
#
#To repeat: if the goal of the multiple regression is insight into causal relationships, it is
#safer to drop such variables from the explanatory variables; if the goal is prediction, it is
#fine to leave them in.
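#A minimal sketch of the usual fix for those 'inf' VIFs (my own addition, not part of the
#original notebook; the DataFrame `df` and the 'week' column name are assumptions based on
#the text above): a full set of dummy columns is perfectly collinear with the intercept,
#so drop one dummy level.
import pandas as pd
import statsmodels.api as sm

dummies = pd.get_dummies(df['week'], prefix='week', drop_first=True)
X_nodup = sm.add_constant(pd.concat([df.drop(columns=['week']), dummies], axis=1))
#Re-computing the VIFs on X_nodup should now give finite values for the week dummies.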
Example #19
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

r2score = r2_score(data['sales'], Y_pred)
print(r2score)

rmse = np.sqrt(mean_squared_error(data['sales'], Y_pred))
print(rmse)

# In[52]:

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
ind_df = data.iloc[:, :-1]

vif_df = pd.DataFrame()
vif_df["features"] = ind_df.columns
vif_df["VIF Factor"] = [vif(ind_df.values, i) for i in range(ind_df.shape[1])]
vif_df.round(2)

# In[15]:

import statsmodels.formula.api as sm

# create a fitted model with two features
lm_model = sm.ols(formula='sales ~ TV + radio ', data=data).fit()

# print the coefficients
print(lm_model.params)
print(lm_model.summary())

# In[55]:
Example #20
plt.show()

# Based on the Cook's Distance plot, **there are few data points with residuals possibly being outliers.**

# **Variance Inflation Factor (VIF)**
#
# The VIF of each predictor lets us check which factors contribute to multicollinearity in our
# model: it is the ratio of a coefficient's variance in the full multiple-regression model to
# its variance in a simple model containing that predictor alone.
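# Equivalently, VIF_j = 1 / (1 - R_j^2), where R_j^2 is the R-squared obtained by regressing
# predictor j on all the other predictors. A small self-contained check of this identity on
# fabricated data (my own sketch, not part of the original notebook):

# In[ ]:

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

Z = np.random.default_rng(5).normal(size=(100, 3))
Z[:, 0] += 0.8 * Z[:, 1]  # induce correlation between the first two columns
exog = sm.add_constant(Z)
j = 1  # column index within exog (index 0 is the constant)
others = np.delete(exog, j, axis=1)
r2 = sm.OLS(exog[:, j], others).fit().rsquared
print(1.0 / (1.0 - r2), vif(exog, j))  # the two numbers agree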

# In[31]:

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# In[32]:

for i in range(len(best_feats.columns)):
    v = vif(np.asarray(best_feats), i)  # np.matrix is deprecated; use a plain ndarray
    print('Variance Inflation Factor for {}: {}'.format(
        best_feats.columns[i], round(v, 2)))

# It seems that two factors in our model, **length of stay** and **available facilities & services**, have VIFs > 10. This means **there is multicollinearity in our model.**

# # Prediction & their Intervals
#
# To test our model on patients with IDs 1-5, we shall create a new dataframe with just those rows, and get predictions from our model

# In[33]:

# Create test data from patient ids 1-5 with best features
test_data = pd.DataFrame(raw_data[:5],
                         columns=[
                             'length of stay', 'routine culturing',
Example #21
# 5. Assess
# Compared actual price vs predicted price to check accuracy using R-squared
#ols_model.summary()

# Implementation of variance inflation factor for multicollinearity removal
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
type(X)

# Convert dataframe to ndarray
x_array = X.values
type(x_array)

# Implementation of multi-collinearity removal
for i in range(len(independent)):
    mvif = [
        vif(X[independent].values, index) for index in range(len(independent))
    ]
    max_vif = max(mvif)
    dindex = mvif.index(max_vif)
    #print("Index", dindex, "MaxVIF", max_vif, "Column", independent[dindex])
    if max_vif > 10:
        independent = independent.delete(dindex)
#print(independent)

Y = data["price"]
X_new = data[independent]
ols_model_1 = sm.OLS(Y, X_new).fit()
ols_model_1.summary()

predict_price_1 = ols_model_1.predict(X_new)
Example #22
model = model.fit(X , Y )

print("The slope(m) of equation is", model.coef_)
print("The intercept/residue (c) is", model.intercept_)

Ypred = model.predict(X)

from sklearn.metrics import  r2_score, mean_absolute_error , mean_squared_error

r2_score(Y, Ypred)

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

for i in range(len(independent)):
  vif_list = [vif(data[independent].values, index) for index in range(len(independent))]
  mvif = max(vif_list)
  print("Max VIF value:",mvif)
  drop_index = vif_list.index(mvif)
  if mvif > 10:
    print("deleting", independent[drop_index])
    independent = independent.delete(drop_index)
print("Final Independent Variables", independent)

import statsmodels.api as sm
Y = data["price"]
X = data[independent]
model = sm.OLS(Y,X)
model = model.fit()
model.summary()
Example #23
print(
    'RMSE by Linear Regression: ',
    numerical.sqrt(
        metrics.mean_squared_error(testDataY, predictionByLinearRegression)))

variableForStatisticX = statistic.add_constant(trainDataX)

print(variableForStatisticX.head())

variableForEST = statistic.OLS(trainDataY, variableForStatisticX)

variableForESTVisualization = variableForEST.fit()

print(variableForESTVisualization.summary())

VIFS = [
    vif(variableForStatisticX.values, i)
    for i in range(len(variableForStatisticX.columns))
]

matrix.Series(data=VIFS, index=variableForStatisticX.columns)

############ PART 2. GMDH FOR REGRESSION ANALYSIS ############

GMDH = MultilayerGMDH(ref_functions='linear')

GMDHModel = GMDH.fit(trainDataX, trainDataY)

predictionByGMDH = GMDH.predict(testDataX)

figures.scatter(testDataY, predictionByGMDH)

Example #24
# Variance inflation factors

# In[21]:


# don't forget to add constant if the ols model includes intercept
df_exog = sm.add_constant(df.drop('medv', axis = 1))

# too fancy for printing results?
for i, col in enumerate(df_exog.columns):
    if col == 'const':
        pass
    elif len(col) > 6:
        print(col, ':', "{0:.2f}".format(vif(df_exog.values, i)))
    else:
        print(col, '\t:', "{0:.2f}".format(vif(df_exog.values, i)))


#  Run a regression excluding *age* predictor (formula = 'medv ~ . - age')

# In[22]:


lm = smf.ols(formula = ols_formula(df, 'medv', 'age'), data = df)
lm_fit = lm.fit()
lm_fit.summary()


# ### 3.6.4 Interaction Terms
Example #25
import pandas as pd
from sklearn.linear_model import LinearRegression as lm
import statsmodels.formula.api as smf
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
df = pd.read_csv("bike.csv")
df.head()

features = "+".join(df.columns[1:-3])
y, X = dmatrices("casual ~ " + features, df, return_type = "dataframe")

df_vif = pd.DataFrame()
df_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif["features"] = X.columns
df_vif

model1 = smf.ols("casual ~ " + features, data = df)
print(model1.fit().summary())

X_df = df.iloc[:, 1:-3]
model2 = lm().fit(X_df, y)
model2.predict(X_df.iloc[:3, :])
Example #26
# Fit the Ordinary Least Squares Regression Model
import statsmodels.api as sm
model = sm.OLS(Y, X)

# Train the model
model = model.fit()

# Check the model summary
model.summary()

# Calculate variance inflation factor
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
for i in range(len(independent_variables)):
    vif_list = [
        vif(data[independent_variables].values, index)
        for index in range(len(independent_variables))
    ]
    mvif = max(vif_list)
    print("Max VIF value is", mvif)
    drop_index = vif_list.index(mvif)
    print("For the Independent variable", independent_variables[drop_index])
    if mvif > 10:
        print("Deleting", independent_variables[drop_index])
        independent_variables = independent_variables.delete(drop_index)
print("Final Independent Variables", independent_variables)

Y = data["price"]
X = data[independent_variables]
model = sm.OLS(Y, X)
model = model.fit()
Example #27
corr = round(dataWOE.corr(),2)

mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from NumPy
mask[np.triu_indices_from(mask)] = True             
plt.figure(figsize = (5, 5))
cmap = sns.diverging_palette(220, 10, as_cmap=True) 
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, annot =True, cbar_kws={"shrink": .5})
plt.show()



"""选择方差共线性<10的变量"""
col = np.array(data[short_list_2])
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
for i in range(len(short_list_2)):
    print('{} VIF is {}'.format(short_list_2[i], vif(col, i)))
        
        

"""判断显著性"""
X = data[short_list_2].copy()  # copy so adding the intercept does not trigger SettingWithCopyWarning
X['intercept'] = [1]*X.shape[0]
y = data['target']

import statsmodels.api as sm
lr_sm=sm.Logit(y, X).fit()
lr_sm.summary()



'''Build the model'''
Example #28
def judge_vif(X):
    from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
    vif_data = pd.DataFrame([])
    vif_data["VIF_Factor"] = [vif(X.values, i) for i in range(X.shape[1])]
    vif_data["features"] = X.columns
    return vif_data
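# A minimal usage sketch for judge_vif on fabricated data (assumes pandas and numpy are
# imported as pd/np at module level):
rng = np.random.default_rng(6)
base = rng.normal(size=150)
frame = pd.DataFrame({
    'f1': base,
    'f2': 2 * base + rng.normal(scale=0.1, size=150),  # nearly collinear with f1
    'f3': rng.normal(size=150),
})
print(judge_vif(frame))  # f1 and f2 should show clearly elevated VIFs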
Example #29
def colin_test():

    #Test the collinearity of the logistic equations by using VIF
    from sklearn.metrics import r2_score
    from scipy.stats import spearmanr

    #BARRA
    logit = LogisticRegression(class_weight="balanced", solver="liblinear")
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
     "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
     is_pss="hss", model_name="barra_fc_v3")
    #Convective AWS
    event = "is_conv_aws"
    preds = ["eff_lcl", "U1", "sb_cape", "lr13", "rhmin03", "lr36", "eff_cin"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    preds = ["eff_lcl", "U1", "sb_cape", "lr13", "rhmin03", "eff_cin"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df2 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    preds = ["ml_el", "Umean06", "lr36", "rhmin13", "dcape"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df3 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    preds = ["ml_el", "Umean06", "rhmin13", "dcape"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df4 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    (pd.concat([df1, df2], axis=1)).to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/vif_barra_aws.csv",
        float_format="%.2e")
    print(pd.concat([df1, df2], axis=1))

    #Test CV HSS scores
    #preds = ["eff_lcl","U1","sb_cape","lr13","rhmin03","eff_cin"]
    #barra_aws = logit_predictor_test("barra", "is_conv_aws", preds, "t_totals", 16)

    #STA
    preds = ["ml_cape", "Umean06", "eff_lcl", "scld"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    print(df1)

    #ERA5
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
     "era5_allvars_v2_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
     is_pss="hss", model_name="era5")
    #Convective AWS
    preds = ["ml_el", "Umean03", "eff_lcl", "dpd700", "lr36", "rhmin01"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)
    preds = ["ml_el", "Umean03", "eff_lcl", "dpd700", "rhmin01"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df2 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)
    (pd.concat([df1, df2], axis=1)).to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/vif_era5_aws.csv",
        float_format="%.2e")
    print(pd.concat([df1, df2], axis=1))
    #Test CV HSS scores
    #preds = ["ml_el","Umean03","eff_lcl","dpd700","rhmin01"]
    #era5_aws = logit_predictor_test("era5", "is_sta", preds, "t_totals", 16)

    #STA
    preds = ["ml_cape", "Umean06", "srhe_left", "lr13"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)
    print(df1)
Example #30
# In[105]:

import seaborn as sns

corr_df = X.corr(method="pearson")
print(corr_df)

sns.heatmap(corr_df, vmax=1.0, vmin=-1.0, annot=True)

# In[106]:

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

vif_df = pd.DataFrame()
vif_df['features'] = X.columns
vif_df['VIF Factor'] = [vif(X.values, i) for i in range(X.shape[1])]
vif_df.round(2)

# In[107]:

from sklearn.model_selection import train_test_split

#split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=10)

# In[108]:

from sklearn.linear_model import LinearRegression