Example #1
# Shared imports for the examples in this section; helper_functions is a
# project-local module providing smote_train (sketched after this example).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, auc, f1_score, precision_score,
                             recall_score, roc_curve)
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

import helper_functions


def run_sgd(X_train, X_test, y_train, y_test):
    '''
    Function to fit Stochastic Gradient Descent Model and predict on test set
    
    Input: Train and Test Data.
    
    Output: Dataframe with metric scores. Confusion Matrix and ROC Curve plots.
    '''
    
    print('SGD Results:')
    
    # Fit model and get predictions. SGDClassifier defaults to hinge loss,
    # which has no predict_proba, so the fitted model is wrapped in
    # CalibratedClassifierCV to obtain probabilities for the ROC curve.
    model = SGDClassifier()
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    model.fit(X_train_res, y_train_res)
    # Note: calibrating on the same resampled data the model was fit on can
    # yield optimistic probabilities; a held-out split would be safer.
    calibrator = CalibratedClassifierCV(model, cv='prefit')
    calibrator.fit(X_train_res, y_train_res)
    y_hat_test = model.predict(X_test)
    
    # Calculate metrics
    prec = precision_score(y_test, y_hat_test)
    recall = recall_score(y_test, y_hat_test)
    acc = accuracy_score(y_test, y_hat_test)
    f1 = f1_score(y_test, y_hat_test)
    print()
    print("Model Metrics:")
    print(f"Precision: {prec}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {acc}")
    print(f"F1_Score: {f1}")
    
    # Plot Confusion Matrix (the ylim call works around matplotlib versions
    # that clip the top and bottom rows of the heatmap)
    skplt.metrics.plot_confusion_matrix(y_test, y_hat_test, figsize = (4,4))
    plt.ylim([1.5, -.5])
    plt.tight_layout()
    plt.show()
    
    # Plot ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, calibrator.predict_proba(X_test)[:,1])
    AUC = auc(fpr, tpr)
    scores = [prec, recall, acc, f1, AUC]
    print()
    print(f'AUC: {AUC}')
    plt.plot(fpr, tpr, lw = 2, label = 'ROC Curve', color = 'orange')
    plt.plot([0,1], [0,1], lw = 2, linestyle = '--', color = 'r')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('ROC Curve and AUC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = 'lower right')
    plt.tight_layout()
    plt.show()
    
    print('-'*75)
    
    return pd.DataFrame({
        'Model' : ['SGD'] * len(scores),
        'Metric' : ['Precision', 'Recall', 'Accuracy', 'F1_Score', 'AUC'],
        'Score' : scores})
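
Every example on this page calls helper_functions.smote_train, which is not shown here. Below is a minimal sketch of what such a helper presumably looks like, assuming the imbalanced-learn package (SMOTE and fit_resample are real imblearn API; the body is an assumption, not the project's actual code):

# Hypothetical reconstruction of the project-local helper, assuming imblearn.
from imblearn.over_sampling import SMOTE

def smote_train(X_train, y_train):
    '''Oversample the minority class in the training split only (sketch).'''
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    return X_train_res, y_train_res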
Example #2
def grid_search_estimators(X_train, y_train):
    '''
    Function to grid search for best n value (# of estimators for random forest)
    
    Input: Training features and target
    
    Output: Optimized n (n_estimators) parameter for Random Forest
    '''

    N = list(range(70, 125, 5))
    hyperparameters = dict(n_estimators=N)
    rf = RandomForestClassifier()
    clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=0)
    # Note: oversampling before cross-validation lets synthetic samples leak
    # into the validation folds; an imblearn Pipeline with SMOTE as a step
    # inside the search would avoid this.
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    grid = clf.fit(X_train_res, y_train_res)
    n = grid.best_estimator_.get_params()['n_estimators']

    return n
Example #3
def grid_search_neighbors(X_train, y_train):
    '''
    Function to grid search for best k value (# of neighbors for knn)
    
    Input: Training features and target
    
    Output: Optimized k (n_neighbors) parameter for KNN
    '''

    K = list(range(1, 9, 2))
    hyperparameters = dict(n_neighbors=K)
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, hyperparameters, cv=5, verbose=0)
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    grid = clf.fit(X_train_res, y_train_res)
    k = grid.best_estimator_.get_params()['n_neighbors']

    return k
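
A hedged sketch of how the two grid searches plug into the model runners (run_random_forest is defined in Example #6 below; no KNN runner appears on this page, so the last line only shows where k would go):

# Tune on the training split, then hand the winners to the runners.
n = grid_search_estimators(X_train, y_train)
rf_results = run_random_forest(X_train, X_test, y_train, y_test, n)

k = grid_search_neighbors(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors=k)  # hypothetical follow-up use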
Example #4
def run_linear_svc(X_train, X_test, y_train, y_test):
    '''
    Function to fit Linear Support Vector Machine Model and predict on test set
    
    Input: Train and Test Data.
    
    Output: Dataframe with metric scores. Confusion Matrix.
    '''
    
    print('Linear SVC Results:')
    
    # Fit model and get predictions
    model = LinearSVC()
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    model.fit(X_train_res, y_train_res)
    y_hat_test = model.predict(X_test)
    
    # Calculate metrics
    prec = precision_score(y_test, y_hat_test)
    recall = recall_score(y_test, y_hat_test)
    acc = accuracy_score(y_test, y_hat_test)
    f1 = f1_score(y_test, y_hat_test)
    print()
    print("Model Metrics:")
    print(f"Precision: {prec}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {acc}")
    print(f"F1_Score: {f1}")
    
    # Plot Confusion Matrix
    skplt.metrics.plot_confusion_matrix(y_test, y_hat_test, figsize = (4,4))
    plt.ylim([1.5, -.5])
    plt.tight_layout()
    plt.show()
    
    scores = [prec, recall, acc, f1]

    print('-'*75)
    
    return pd.DataFrame({
        'Model' : ['Linear_SVC'] * len(scores),
        'Metric' : ['Precision', 'Recall', 'Accuracy', 'F1_Score'],
        'Score' : scores})
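
Unlike the other runners, run_linear_svc skips the ROC curve because LinearSVC has no predict_proba. If an AUC were wanted, roc_curve also accepts raw decision scores, so something along these lines could be appended inside the function (decision_function is real scikit-learn API; the variable name is an assumption):

    # Sketch: ROC/AUC from decision scores, since LinearSVC lacks predict_proba
    svm_scores = model.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, svm_scores)
    print(f'AUC: {auc(fpr, tpr)}')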
Example #5
def grid_search_CP(X_train, y_train):
    '''
    Function to grid search for best C and penalty for Logistic Regression
    
    Input: Training features and target
    
    Output: Optimized C and Penalty parameters for Logistic Regression
    '''

    penalty = ['l1', 'l2']
    C = np.arange(.1, 50, .5)
    hyperparameters = dict(C=C, penalty=penalty)
    # liblinear supports both the l1 and l2 penalties being searched;
    # the default lbfgs solver would fail on l1.
    lr = LogisticRegression(solver='liblinear')
    clf = GridSearchCV(lr, hyperparameters, cv=5, verbose=0)
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    grid = clf.fit(X_train_res, y_train_res)
    c = grid.best_estimator_.get_params()['C']
    p = grid.best_estimator_.get_params()['penalty']

    return c, p
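
The tuned values feed straight into run_logreg from Example #7 below; a usage sketch (the variable names are assumptions):

c, p = grid_search_CP(X_train, y_train)
logreg_results = run_logreg(X_train, X_test, y_train, y_test, c, p)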
Example #6
def run_random_forest(X_train, X_test, y_train, y_test, n):
    '''
    Function to fit Random Forest Model and predict on test set
    
    Input: Train and Test Data. Optimized n parameter.
    
    Output: Dataframe with metric scores. Confusion Matrix and ROC Curve plots. Feature Importance Plot.
    '''
    
    print('Random Forest Results:')
    
    # Fit model and get predictions
    model = RandomForestClassifier(n_estimators = n)
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    model.fit(X_train_res, y_train_res)
    y_hat_test = model.predict(X_test)
    
    # Calculate metrics
    prec = precision_score(y_test, y_hat_test)
    recall = recall_score(y_test, y_hat_test)
    acc = accuracy_score(y_test, y_hat_test)
    f1 = f1_score(y_test, y_hat_test)
    print()
    print("Model Metrics:")
    print(f"Precision: {prec}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {acc}")
    print(f"F1_Score: {f1}")
    
    # Plot Confusion Matrix
    skplt.metrics.plot_confusion_matrix(y_test, y_hat_test, figsize = (4,4))
    plt.ylim([1.5, -.5])
    plt.tight_layout()
    plt.show()
    
    # Plot ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
    AUC = auc(fpr, tpr)
    scores = [prec, recall, acc, f1, AUC]
    print()
    print(f'AUC: {AUC}')
    plt.plot(fpr, tpr, lw = 2, label = 'ROC Curve', color = 'orange')
    plt.plot([0,1], [0,1], lw = 2, linestyle = '--', color = 'r')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('ROC Curve and AUC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = 'lower right')
    plt.tight_layout()
    plt.show()
    
    # Plot feature importances
    features = list(X_train.columns)
    importances = model.feature_importances_
    indices = np.argsort(importances)
    plt.figure(figsize = (10,10))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='darkblue', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    plt.show()
    
    print('-'*75)
    
    return pd.DataFrame({
        'Model' : ['Random_Forest'] * len(scores),
        'Metric' : ['Precision', 'Recall', 'Accuracy', 'F1_Score', 'AUC'],
        'Score' : scores})
Example #7
def run_logreg(X_train, X_test, y_train, y_test, C, penalty):
    '''
    Function to fit Logistic Regression Model and predict on test set
    
    Input: Train and Test Data. Optimized C and Penalty parameters
    
    Output: Dataframe with metric scores. Confusion Matrix and ROC Curve plots.
    '''
    
    # Print test-set class balance (assumes y_test is a DataFrame with a
    # 'target' column)
    y_pos = y_test.target.value_counts()[1]
    drug_user_percent = round(y_pos / len(y_test), 2)
    print(f'Drug user percent: {drug_user_percent * 100}%')
    print()
    print('Logistic Regression Results:')
    
    # Fit model and get predictions
    model = LogisticRegression(C = C, penalty = penalty, fit_intercept = False, solver = 'liblinear')
    X_train_res, y_train_res = helper_functions.smote_train(X_train, y_train)
    model.fit(X_train_res, y_train_res)
    y_hat_test = model.predict(X_test)
    
    # Calculate metrics
    prec = precision_score(y_test, y_hat_test)
    recall = recall_score(y_test, y_hat_test)
    acc = accuracy_score(y_test, y_hat_test)
    f1 = f1_score(y_test, y_hat_test)
    print()
    print("Model Metrics:")
    print(f"Precision: {prec}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {acc}")
    print(f"F1_Score: {f1}")
    
    # Plot Confusion Matrix
    skplt.metrics.plot_confusion_matrix(y_test, y_hat_test, figsize = (4,4))
    plt.ylim([1.5, -.5])
    plt.tight_layout()
    plt.show()
    
    # Plot ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
    AUC = auc(fpr, tpr)
    scores = [prec, recall, acc, f1, AUC]
    print()
    print(f'AUC: {AUC}')
    plt.plot(fpr, tpr, lw = 2, label = 'ROC Curve', color = 'orange')
    plt.plot([0,1], [0,1], lw = 2, linestyle = '--', color = 'r')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('ROC Curve and AUC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = 'lower right')
    plt.tight_layout()
    plt.show()
    
    print('-'*75)
    
    return pd.DataFrame({
        'Model' : ['Logistic_Regression'] * len(scores),
        'Metric' : ['Precision', 'Recall', 'Accuracy', 'F1_Score', 'AUC'],
        'Score' : scores})
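
Since every runner returns the same long-format DataFrame (Model / Metric / Score), the outputs stack naturally for a side-by-side comparison. A sketch, assuming each run_* return value was captured as in the usage examples above (sgd_results and svc_results analogously to rf_results and logreg_results):

# Stack all model results and pivot to one column per model; Linear_SVC
# has no AUC row, so that cell comes out as NaN.
results = pd.concat([sgd_results, svc_results, rf_results, logreg_results],
                    ignore_index=True)
print(results.pivot(index='Metric', columns='Model', values='Score'))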