def basic_significance(dataframe, list_to_dummify, target):
    '''
    Fit a non-regularized logistic (Logit) model of `target` on the
    predictors in `dataframe`.

    Prints in-sample (train) accuracy and returns the fitted results
    together with the statistically significant coefficients (p < 0.05),
    ordered by absolute magnitude.
    ----------
    dataframe: pandas DataFrame holding predictors and the target column
    list_to_dummify: a list of columns in string format that require
        dummification before modeling
    target: name of the target column

    Returns
    -------
    (fitted_logit, coefs): statsmodels results object and a DataFrame
        with columns ['p-values', 'coefs'] indexed by predictor name.
    '''
    # process the dataframe (copy so the caller's frame is untouched)
    df = dataframe.copy()
    df = dummify_columns(df, list_to_dummify)
    X, y = xy_split(df, target)
    X = add_constant(X)
    # fit the model — instance calls instead of the original unbound-method
    # forms Logit.fit(logit) / Logit.predict(logit, params), which obscured
    # intent while doing the same thing
    logit = Logit(y, X)
    fitted_logit = logit.fit()
    # train accuracy: round predicted probabilities to hard 0/1 labels
    c_mat = confusion_matrix(y, np.round(fitted_logit.predict(X)))
    accuracy = np.trace(c_mat) / np.sum(c_mat)
    print('model train accuracy: %s' % (accuracy))
    # keep only coefficients significant at the 5% level
    coefs = pd.DataFrame(fitted_logit.pvalues[fitted_logit.pvalues < 0.05])
    coefs['coefs'] = fitted_logit.params.filter(items=coefs.index)
    coefs.columns = ['p-values', 'coefs']
    # order by absolute effect size, then drop the helper column
    coefs['abs_coefs'] = np.abs(coefs.coefs)
    coefs = coefs.sort_values(by='abs_coefs', ascending=False)
    coefs = coefs.drop('abs_coefs', axis=1)
    return fitted_logit, coefs
Example #2
0
def score(df):
    """Report VIFs, a full-data Logit summary, and 5-fold CV metrics.

    Prints the variance inflation factor for every predictor column,
    fits a statsmodels Logit on all the data (summary labeled with the
    externally defined `names` sequence), then cross-validates an
    sklearn LogisticRegression and prints mean accuracy, precision,
    and recall across the folds.
    """
    X, y = get_X_y(df)

    # one VIF per predictor column
    vif = variance_inflation_factor
    print('VIF: ')
    for col in range(X.shape[1]):
        print(vif(X, col))

    X = add_constant(X)

    model = Logit(y, X).fit()
    print(model.summary(xname=names))

    kfold = KFold(n_splits=5)

    fold_metrics = {'acc': [], 'prec': [], 'rec': []}

    for train_idx, test_idx in kfold.split(X):
        clf = LogisticRegression(solver="lbfgs")
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])
        actual = y[test_idx]
        fold_metrics['acc'].append(accuracy_score(actual, predicted))
        fold_metrics['prec'].append(precision_score(actual, predicted))
        fold_metrics['rec'].append(recall_score(actual, predicted))

    print("Accuracy:", np.average(fold_metrics['acc']))
    print("Precision:", np.average(fold_metrics['prec']))
    print("Recall:", np.average(fold_metrics['rec']))
    def fit_logit(self):
        '''
        Logistic regression of the 'repeat' indicator on all other
        features of self.df (excluding 'CustomerNo').

        Prints the statsmodels Logit summary, then baseline (mode) and
        logit cross-validation diagnostics.
        Returns the SMOTE-resampled X and y arrays.
        '''
        self.y = self.df['repeat'].values
        self.X = self.df.drop(['repeat', 'CustomerNo'], axis=1).values
        # smote the data: rebalance the classes (0.5 minority ratio)
        self.X_smote, self.y_smote = smote(self.X, self.y, 0.5)
        self.X_const = add_constant(self.X_smote, prepend=True)
        logit_model = Logit(self.y_smote, self.X_const).fit()
        print(logit_model.summary())
        # NOTE(review): the original computed logit_model.predict(...) here
        # and discarded the result; that unused call was removed.

        # baseline model: the mode class assigned to every individual
        mode_model_acc, mode_model_precision, mode_model_recall = self.mode_cross_val(
            self.X_smote, self.y_smote)
        print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
            mode_model_acc, mode_model_precision, mode_model_recall))

        # cross-validated logit diagnostics on the same resampled data
        model_acc, model_precision, model_recall = self.logit_cross_val(
            self.X_smote, self.y_smote)
        print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
            model_acc, model_precision, model_recall))

        return self.X_smote, self.y_smote
Example #4
0
def logit_reg():
    """Fit a statsmodels Logit (Powell optimizer) on SMOTE-resampled data.

    Splits the module-level `df` via prep_X_y, prints the confusion
    matrix and accuracy/precision/recall on the held-out test split, and
    returns the fitted results object.
    """
    X_smoted, X_test, y_smoted, y_test = prep_X_y(df, constant=True)
    lm = Logit(y_smoted, X_smoted).fit(method='powell')
    # round predicted probabilities to hard 0/1 labels
    y_pred = lm.predict(X_test).round(0)
    # fix: Python 2 print statements converted to Python 3 print() calls
    # (and tab indentation normalized to spaces)
    print('Statsmodels Logit Regression--------------------------------')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    return lm
Example #5
0
def logit_reg():
    """Fit a statsmodels Logit (Powell optimizer) on SMOTE-resampled data.

    Splits the module-level `df` via prep_X_y, prints the confusion
    matrix and accuracy/precision/recall on the held-out test split, and
    returns the fitted results object.
    """
    X_smoted, X_test, y_smoted, y_test = prep_X_y(df, constant=True)
    lm = Logit(y_smoted, X_smoted).fit(method='powell')
    # round predicted probabilities to hard 0/1 labels
    y_pred = lm.predict(X_test).round(0)
    # fix: Python 2 print statements converted to Python 3 print() calls
    print('Statsmodels Logit Regression--------------------------------')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    return lm
Example #6
0
class LogReg:
    """Minimal sklearn-style wrapper around statsmodels Logit.

    Exposes fit/predict and stores the feature coefficients
    (intercept excluded) in ``coef_``.
    """

    def __init__(self):
        # None until fit() has been called; predict() checks this
        self.coef_ = None

    def fit(self, X, y):
        """Fit a Logit of y on X (an intercept column is prepended)."""
        X = add_constant(X)
        self.lr = Logit(y, X)
        self.l_fitted = self.lr.fit()
        # fix: add_constant prepends the intercept column by default, so
        # the intercept is params[0]; the original sliced params[:-1],
        # which dropped the LAST feature coefficient instead.
        self.coef_ = self.l_fitted.params[1:]

    def predict(self, X):
        """Return predicted probabilities for X; warns if not fitted."""
        if self.coef_ is None:
            print('you must first fit the model')
            return
        X = add_constant(X)
        # equivalent to self.lr.predict(self.l_fitted.params, X)
        return self.l_fitted.predict(X)
Example #7
0
    # plot
    # NOTE(review): fragment of a larger plotting function — `ax`, `fpr`,
    # `tpr`, and `roc_auc` are defined earlier, outside this excerpt.
    auc = ax.plot(fpr, tpr, 'b', label='Val AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    # diagonal chance line for reference
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')

    # returns the Line2D list produced by ax.plot (despite the name `auc`)
    return auc


#%%
if __name__ == '__main__':
    # local imports so the script section is self-contained
    from statsmodels.discrete.discrete_model import Logit
    import numpy as np
    import pandas as pd
    from sklearn.metrics import roc_auc_score

    train = pd.read_csv(r'./train.csv')

    # columns 2:-1 are treated as features; 'target' is the label
    logit_model = Logit(train['target'], train.iloc[:, 2:-1])
    logit_model = logit_model.fit(
        disp=0)  # disp=0 Don't show convergence message.

    # in-sample predicted probabilities
    predsLog = logit_model.predict(train.iloc[:, 2:-1])
    #%%
    # NOTE(review): `plt` and `aucplot` are not imported/defined in this
    # excerpt — presumably provided earlier in the file; verify before
    # running this section in isolation.
    fig_1 = plt.figure(figsize=(6, 6))
    aucplot(train['target'], predsLog)
    plt.show()
Example #8
0
# Model Evaluation -------------------------------------------------------------
# NOTE(review): script excerpt — `dataset_names`, `datasets`, `X_test`,
# `y_test`, `geometric_mean_score`, and the classifier imports are defined
# elsewhere in the file; verify before running this section in isolation.
for ds_name, ds in zip(dataset_names, datasets):
    X_train, y_train = ds

    # Decision Tree
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    score = geometric_mean_score(y_test, y_pred)
    score = round(score, 3)
    print("{} Dataset with Decision Tree: {}".format(ds_name, score))

    # Logistic Regression
    logit = Logit(endog=y_train, exog=X_train)
    result = logit.fit(maxiter=1000, disp=0)
    # hard 0/1 labels from rounded predicted probabilities
    y_pred = logit.predict(params=result.params, exog=X_test).round()
    score = geometric_mean_score(y_test, y_pred)
    score = round(score, 3)
    print("{} Dataset with Logistic Regression: {}".format(ds_name, score))

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    score = geometric_mean_score(y_test, y_pred)
    score = round(score, 3)
    print("{} Dataset with Random Forest: {}".format(ds_name, score))

# Check effect of each variable ================================================
# Logistic Regression (Coefficients)
# NOTE(review): `X_rus`/`y_rus` (random-undersampled data, presumably) come
# from outside this excerpt; the model below is fit/used past the cut-off.
logit = Logit(endog=y_rus, exog=X_rus)
Example #9
0
# Logistic regression using 'statsmodels.discrete.discrete_model.Logit'
# 'Logit' expects data in a different format; values of 'y' must be integers
# NOTE(review): `y_train`, `y_test`, `X_train`, `X_test`, `pd`, and `plt`
# are defined elsewhere in the file; verify before running in isolation.

# Make dataset: map string labels to integers ('B' -> 0, anything else -> 1)
y_train_sm = [0 if label == 'B' else 1 for label in y_train]
y_train_sm = pd.Series(y_train_sm)
y_test_sm = [0 if label == 'B' else 1 for label in y_test]
y_test_sm = pd.Series(y_test_sm)

# Instantiate a Logistic Regression classifier (using statsmodels)
logit = Logit(endog=y_train_sm, exog=X_train)
result = logit.fit()
print('- Logistic regression result using statsmodels:\n', result.summary())

# Predict probabilities of train & test sets
y_train_sm_prob = logit.predict(params=result.params, exog=X_train)
y_test_sm_prob = logit.predict(params=result.params, exog=X_test)

# Plot predicted probabilities against train set
plt.figure(1)
plt.scatter(
    x=range(X_train.shape[0]),
    y=y_train_sm_prob,
    # red = predicted positive (p >= 0.5), blue = predicted negative
    color=['red' if prob >= 0.5 else 'blue' for prob in y_train_sm_prob],
    alpha=0.5)
plt.title('Sigmoid values for train data')
plt.xlabel('observation number')
plt.ylabel('probability')
plt.show()

# Plot predicted probabilities against train set
# NOTE(review): the comment above looks copy-pasted — the section that
# followed (truncated here) presumably plots the TEST set; confirm
# against the full file.
Example #10
0
#%%
# fix: `from datetime import datetime` originally appeared at the very
# bottom of the script, AFTER datetime.now() was called, which raised a
# NameError. Imports are now grouped at the top.
from datetime import datetime

from statsmodels.discrete.discrete_model import Logit
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import os

# timestamp used to tag the output filename (month-day-year-hour)
dt = datetime.now().strftime('%m-%d-%y-%H')

#%% LOAD DATA
train = pd.read_csv(r'./train.csv')

#%%
# columns 2:-1 are treated as features; 'target' is the label
logit_model = Logit(train['target'], train.iloc[:, 2:-1])
logit_model = logit_model.fit(disp=0)  # disp=0 Don't show convergence message.
#%%
# in-sample predicted probabilities and validation AUC
predsLog = logit_model.predict(train.iloc[:, 2:-1])
predsLog_auc = roc_auc_score(train['target'], predsLog)
print('========================================================')
print('Val_AUC = ', round(predsLog_auc, 5))
print('========================================================')

#%% predict test
test = pd.read_csv(r'./test.csv')
predsLog2 = logit_model.predict(test.iloc[:, 1:-1])
#%% output
testoutput = pd.DataFrame()
testoutput['ID_code'] = test['ID_code'].copy()
testoutput['target'] = predsLog2
testoutput.to_csv(r'./logistic_out_%s.csv' % dt, index=False)
#%%