Example #1
from yellowbrick.classifier import DiscriminationThreshold


def plot_optimal_threshold(model, x_train, y_train):
    """Plot precision, recall, F1, and queue rate across candidate
    discrimination thresholds for a binary classifier."""

    # Visualization Threshold
    visualizer = DiscriminationThreshold(model)

    visualizer.fit(x_train, y_train)  # Fit the data to the visualizer
    visualizer.show()
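
# Hypothetical usage of the helper above (a sketch): the synthetic dataset
# and LogisticRegression below are illustrative assumptions, not part of
# the original example.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, n_classes=2, random_state=42)
plot_optimal_threshold(LogisticRegression(), X, y)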
Example #2
def discrimination_threshold(self) -> None:
    visualizer = DiscriminationThreshold(self.trained_model)
    visualizer.fit(self.X_test,
                   self.y_test)  # Fit the data to the visualizer
    save_dir = f"{self.plots_dir}/discrimination_plot_{self.model_id}.png"
    visualizer.show(outpath=save_dir)
    if not LOCAL:
        upload_to_s3(save_dir,
                     f'plots/discrimination_plot_{self.model_id}.png',
                     bucket=S3_BUCKET_NAME)
    plt.clf()
Example #3
def discrimination_threshold(model, classes, X_train, Y_train, X_test, Y_test):
    from yellowbrick.classifier import DiscriminationThreshold

    # Instantiate the classification model and visualizer
    viz = DiscriminationThreshold(model)

    viz.fit(X_train, Y_train)  # Fit the training data to the visualizer
    # viz.score(X_test, Y_test)
    viz.poof()                 # Draw/show/poof the data
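
# Note: Visualizer.poof() was renamed to show() in Yellowbrick 1.0;
# poof() lingered only as a deprecated alias, so on recent versions
# prefer viz.show().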
Example #4
def plot_discrimination_threshold(clf, data='spam', outpath=None):
    if data == 'spam':
        X, y = load_spam()
    elif data == 'churn':
        X, y = load_churn()
    else:
        raise ValueError("no dataset loader '{}'".format(data))

    _, ax = plt.subplots()

    visualizer = DiscriminationThreshold(clf, ax=ax)
    visualizer.fit(X, y)
    visualizer.poof(outpath=outpath)
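
# Hypothetical call (a sketch): assumes a scikit-learn estimator and the
# load_spam/load_churn loaders available to the original script.
from sklearn.linear_model import LogisticRegression

plot_discrimination_threshold(LogisticRegression(solver="lbfgs"),
                              data='churn',
                              outpath='churn_threshold.png')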
Example #5
                                 verbose=True)

grid_search.fit(X, y, **fit_params)
opt_parameters = grid_search.best_params_
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)

# In[89]:

model_performance(lgbm_clf, 'LightGBM')
scores_table(lgbm_clf, 'LightGBM')

# In[90]:

visualizer = DiscriminationThreshold(lgbm_clf)

visualizer.fit(X, y)
visualizer.poof()

# In[91]:

knn_clf = KNeighborsClassifier()

voting_clf = VotingClassifier(estimators=[('lgbm_clf', lgbm_clf),
                                          ('knn', KNeighborsClassifier())],
                              voting='soft',
                              weights=[1, 1])

params = {'knn__n_neighbors': np.arange(1, 30)}

grid = GridSearchCV(estimator=voting_clf, param_grid=params, cv=5)
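
# Assumed continuation (not in the original notebook cell): fit the KNN
# grid search and recover the tuned voting classifier, mirroring the
# LightGBM search above.
grid.fit(X, y)
print(grid.best_params_)
voting_clf = grid.best_estimator_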
Example #6
def discrimination():
    X, y = load_spam()
    oz = DiscriminationThreshold(LogisticRegression(solver="lbfgs"),
                                 ax=newfig())
    oz.fit(X, y)
    savefig(oz, "discrimination_threshold")
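
# Self-contained equivalent of the gallery snippet above (a sketch): the
# undefined newfig()/savefig() helpers are replaced with plain matplotlib
# and the visualizer's own show(outpath=...).
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.datasets import load_spam

X, y = load_spam()
_, ax = plt.subplots()
oz = DiscriminationThreshold(LogisticRegression(solver="lbfgs"), ax=ax)
oz.fit(X, y)
oz.show(outpath="discrimination_threshold.png")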
Example #7
def telecom_churn_prediction(algorithm,
                             name,
                             X_train,
                             X_test,
                             y_train,
                             y_test,
                             cols,
                             cf=None,
                             plot=False,
                             threshold=False):
    #model
    start = time()  # Get start time
    algorithm.fit(X_train, y_train)
    end = time()  # Get end time
    # Calculate the training time
    train_time = round(end - start, 4)

    #predict
    start = time()  # Get start time
    predictions_test = algorithm.predict(X_test)
    end = time()  # Get end time
    # Calculate the prediction time
    pred_time = round(end - start, 4)

    predictions_train = algorithm.predict(X_train)
    probabilities = algorithm.predict_proba(X_test)

    #coeffs
    if cf is not None:
        if cf == "coefficients":
            coefficients = pd.DataFrame(algorithm.coef_.ravel())
        elif cf == "features":
            coefficients = pd.DataFrame(algorithm.feature_importances_)

        column_df = pd.DataFrame(cols)
        coef_sumry = (pd.merge(coefficients,
                               column_df,
                               left_index=True,
                               right_index=True,
                               how="left"))
        coef_sumry.columns = ["coefficients", "features"]
        coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(y_test, predictions_test))
    #confusion matrix
    conf_matrix = confusion_matrix(y_test, predictions_test)

    #roc_auc_score
    model_roc_auc = roc_auc_score(y_test, predictions_test)
    print('train')
    print("Accuracy   Score : ", accuracy_score(y_train, predictions_train))
    print("Area under curve : ", roc_auc_score(y_train, predictions_train),
          "\n")
    print('test')
    print("Accuracy   Score :", accuracy_score(y_test, predictions_test))
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(y_test, probabilities[:, 1])

    accuracy = accuracy_score(y_test, predictions_test)
    recallscore = recall_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    roc_auc_train = roc_auc_score(y_train, predictions_train)
    roc_auc_test = roc_auc_score(y_test, predictions_test)
    f1score = f1_score(y_test, predictions_test)
    result = pd.DataFrame({
        "Model": [name],
        "Accuracy_score": [accuracy],
        "Recall_score": [recallscore],
        "Precision": [precision],
        "f1_score": [f1score],
        "Area_under_curve(train)": [roc_auc_train],
        "Area_under_curve(test)": [roc_auc_test],
        "train_time": [train_time],
        'pred_time': [pred_time]
    })
    if cf is not None:
        plt.figure(figsize=(12, 8))
        #plot confusion matrix
        plt.subplot(221)
        plt.grid(False)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])
        plt.subplot(222)
        #plot roc curve
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")
        #plot coeffs
        sns.set(font_scale=1)
        plt.subplot(212)
        plt.title('Feature Importances')
        plt.xticks(rotation=90)
        sns.barplot(x=coef_sumry['features'], y=coef_sumry['coefficients'])
        plt.subplots_adjust(top=1.2,
                            bottom=0.2,
                            left=0.10,
                            right=0.95,
                            hspace=0.25,
                            wspace=0.35)
        if threshold:
            #plot threshold
            plt.figure(figsize=(14, 4))
            visualizer = DiscriminationThreshold(algorithm)
            visualizer.fit(X_train, y_train)
            visualizer.poof()
    else:
        plt.figure(figsize=(12, 4))
        #plot confusion matrix
        plt.subplot(121)
        plt.grid(False)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])
        plt.subplot(122)
        #plot roc curve
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")
        plt.subplots_adjust(top=1.2,
                            bottom=0.2,
                            left=0.10,
                            right=0.95,
                            hspace=0.25,
                            wspace=0.35)
    return result
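
# Illustrative call (a sketch): assumes the usual train/test splits with
# X_train as a DataFrame, the plot_confusion_matrix helper used inside the
# function, and an estimator exposing coef_ so that cf="coefficients" applies.
from sklearn.linear_model import LogisticRegression

result = telecom_churn_prediction(LogisticRegression(max_iter=1000),
                                  "Logistic Regression",
                                  X_train, X_test, y_train, y_test,
                                  cols=X_train.columns,
                                  cf="coefficients",
                                  threshold=True)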
Example #8
ax3.set_title('Score distribution')
score = np.linspace(0, 1, 1000)
ax3.plot(score, num_tp1, "-", label="Signal")
ax3.plot(score, num_tn1, "-", label="Background")
ax3.set_ylabel("Count")
ax3.set_xlabel("Score cut")
ax3.legend()
fig3.savefig("plots/forest/Scoredistribution.pdf")

# precision recall threshold curve
# https://www.kaggle.com/kevinarvai/fine-tuning-a-classifier-in-scikit-learn, http://www.scikit-yb.org/en/latest/api/classifier/threshold.html
from yellowbrick.classifier import DiscriminationThreshold
fig5 = plt.figure(5)
ax5 = fig5.add_subplot(111)
visualizer = DiscriminationThreshold(forest, exclude=("queue_rate", "fscore"), ax=ax5)
visualizer.fit(data_train_X, data_train_y)  # Fit the training data to the visualizer
visualizer.poof(outpath="plots/forest/precrecathresh.pdf")  # Draw/show/poof the data

print(time.process_time())  # time.clock() was removed in Python 3.8

print(confusion_matrix(expected, (predicted_probs[:, 1] > 0.3).astype(bool)))
from sklearn.metrics import classification_report
print(classification_report(expected, (predicted_probs[:, 1] > 0.3).astype(bool)))


# http://www.scikit-yb.org/en/latest/api/features/importances.html
from yellowbrick.features.importances import FeatureImportances

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
Example #9
def supervised_prediction(algorithm, training_x, testing_x, training_y,
                          testing_y, cols, cf, threshold_plot):

    #model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)

    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients,
                           column_df,
                           left_index=True,
                           right_index=True,
                           how="left"))
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy   Score : ", accuracy_score(testing_y, predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    #plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False,
                        colorscale="Picnic",
                        name="matrix")

    #plot roc curve
    trace2 = go.Scatter(x=fpr,
                        y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'),
                                  width=2,
                                  dash='dot'))

    #plot coeffs
    trace4 = go.Bar(x=coef_sumry["features"],
                    y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    #subplots
    fig = tls.make_subplots(
        rows=2,
        cols=2,
        specs=[[{}, {}], [{
            'colspan': 2
        }, None]],
        subplot_titles=('Confusion Matrix',
                        'Receiver operating characteristic',
                        'Feature Importances'))

    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(showlegend=False,
                         title="Model performance",
                         autosize=False,
                         height=900,
                         width=800,
                         plot_bgcolor='rgba(240,240,240, 0.95)',
                         paper_bgcolor='rgba(240,240,240, 0.95)',
                         margin=dict(b=195))
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(
        dict(showgrid=True, tickfont=dict(size=10), tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
Example #10

def discrimination_thresholding(xx, yy, estimatorss, **kwargs):
    vz = DiscriminationThreshold(estimatorss,
                                 classes=['Reach, 1 Reach, or L/R Reach',
                                          'Null, Multiple Reaches, Or Multiple Arms'],
                                 cmap="YlGn", size=(600, 360), **kwargs)
    vz.fit(xx, yy)
    vz.score(xx, yy)
    vz.show()
Example #11
def telecom_churn_prediction_alg(algorithm,
                                 training_x,
                                 testing_x,
                                 training_y,
                                 testing_y,
                                 threshold_plot=True):
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score   : ", accuracy_score(testing_y, predictions))
    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc)
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # plot roc curve
    trace1 = go.Scatter(
        x=fpr,
        y=tpr,
        name="Roc : " + str(model_roc_auc),
        line=dict(color=('rgb(22, 96, 167)'), width=2),
    )
    trace2 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'),
                                  width=2,
                                  dash='dot'))

    # plot confusion matrix
    trace3 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False,
                        colorscale="Blues",
                        name="matrix",
                        xaxis="x2",
                        yaxis="y2")

    layout = go.Layout(
        dict(title="Model performance",
             autosize=False,
             height=500,
             width=800,
             showlegend=False,
             plot_bgcolor="rgb(243,243,243)",
             paper_bgcolor="rgb(243,243,243)",
             xaxis=dict(title="false positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        domain=[0, 0.6],
                        ticklen=5,
                        gridwidth=2),
             yaxis=dict(title="true positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        zerolinewidth=1,
                        ticklen=5,
                        gridwidth=2),
             margin=dict(b=200),
             xaxis2=dict(domain=[0.7, 1],
                         tickangle=90,
                         gridcolor='rgb(255, 255, 255)'),
             yaxis2=dict(anchor='x2', gridcolor='rgb(255, 255, 255)')))
    data = [trace1, trace2, trace3]
    fig = go.Figure(data=data, layout=layout)

    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
Example #12
LR.fit(X_train, y_train)


evaluate_model(LR, X_test, y_test, True)  # evaluate model

# let's look at discrimination

pred = LR.predict(X_test)
proba = LR.predict_proba(X_test)  # default discrimination threshold is 0.5; let's find the best threshold
proba = pd.DataFrame(proba, columns=["0", "1"])
proba["Selected Class"] = pred

# try to find the best threshold to maximize F1 score
vis = DiscriminationThreshold(LR)
vis.fit(X_train, y_train)
vis.poof()  # the algorithm tries to maximize F1 score
# threshold = 0.29
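
# Applying the threshold found above in place of predict()'s default 0.5
# cut (a sketch; LR and X_test come from the snippet above):
best_threshold = 0.29
pred_at_best = (LR.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)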

# KNN
from sklearn.neighbors import KNeighborsClassifier


k_scores = {}
for k in range(1, 30, 2):
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(X_train, y_train)

    k_scores[k] = [KNN.score(X_test, y_test),
                   roc_auc_score(y_test, KNN.predict(X_test))]
    
fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),tickangle = 90))
py.iplot(fig)
    


# Usually, we assign an object to a class if the probability of belonging to that class is above 0.5.
# However, this threshold can be adjusted, and this visualizer helps find the optimal value given several metrics (recall, precision, F1, queue rate).

# Find optimal threshold  
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
# The optimal threshold is 0.30. 

# Better method to find optimal threshold 
visualizer = DiscriminationThreshold(classifier)
visualizer.fit(X_train, y_train)
visualizer.poof()
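
# Numeric counterpart to the visualizer (a sketch): pick the threshold that
# maximizes F1 from sklearn's precision_recall_curve. Assumes classifier,
# X_test, and y_test from the surrounding script.
from sklearn.metrics import precision_recall_curve

precision, recall, thresh = precision_recall_curve(
    y_test, classifier.predict_proba(X_test)[:, 1])
f1 = 2 * precision * recall / (precision + recall + 1e-12)
print("best F1 threshold:", thresh[np.argmax(f1[:-1])])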

        


##########################################   IMPROVEMENTS 



# Implement SMOTE 
"""
# SMOTE is not appropriate as it does not deal with dummy variables. 
sm = SMOTE()
X_smote, y_smote = sm.fit_resample(X_train, y_train) 
# Use SMOTENC instead, which does.
"""
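
# A sketch of the SMOTENC suggestion above (assumes imbalanced-learn is
# installed; the categorical column indices below are illustrative).
from imblearn.over_sampling import SMOTENC

sm_nc = SMOTENC(categorical_features=[0, 3], random_state=42)
X_res, y_res = sm_nc.fit_resample(X_train, y_train)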
Example #14
def train_and_evaluate_classifier(
    algorithm,
    training_x,
    testing_x,
    training_y,
    testing_y,
    cols,
    cf='coefficients',
    threshold_plot=True,
):
    """
    Обучение классификатора на тренировочных данных, оценка прогноза на тестовых,
    и визуализация некоторых метрик качества прогноза.
    algorithm     - использованный алгоритм с методами fit, predict и predict_proba
    training_x    - данные для предсказывающих переменных (обучение)
    testing_x     - данные для предсказывающих переменных (тест)
    training_y    - целевая переменная (обучение)
    training_y    - целевая переменная (тест)
    cf - ["coefficients","features"](коэффициенты для логрегрессии, параметры для деревьев)
    threshold_plot - если True, возвращает для модели threshold plot
    """

    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    # coefficients
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)
    else:
        raise ValueError(
            "`cf` value must be one of {'coefficients', 'features'}")

    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients,
                           column_df,
                           left_index=True,
                           right_index=True,
                           how="left"))
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Отчет по классфицикации : \n",
          classification_report(testing_y, predictions))
    print("Точность : ", accuracy_score(testing_y, predictions))
    # confusion_matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Площадь под кривой : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # prepare the confusion matrix for plotting
    trace1 = go.Heatmap(
        z=conf_matrix,
        x=["Not churn", "Churn"],
        y=["Not churn", "Churn"],
        showscale=False,
        colorscale="Picnic",
        name="matrix",
    )

    # prepare the roc curve for plotting
    trace2 = go.Scatter(x=fpr,
                        y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'),
                                  width=2,
                                  dash='dot'))

    # prepare the coefficients for plotting
    trace4 = go.Bar(x=coef_sumry["features"],
                    y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    # draw the subplots
    fig = tls.make_subplots(rows=2,
                            cols=2,
                            specs=[[{}, {}], [{
                                'colspan': 2
                            }, None]])

    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(
        showlegend=False,
        autosize=False,
        height=900,
        width=800,
        plot_bgcolor='rgba(240,240,240, 0.95)',
        paper_bgcolor='rgba(240,240,240, 0.95)',
        margin=dict(b=195),
    )
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(
        dict(showgrid=True, tickfont=dict(size=10), tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()