Example No. 1
def plot_discrimination_threshold(clf, data='spam', outpath=None):
    if data == 'spam':
        X, y = load_spam()
    elif data == 'churn':
        X, y = load_churn()
    else:
        raise ValueError("no dataset loader '{}'".format(data))

    _, ax = plt.subplots()

    visualizer = DiscriminationThreshold(clf, ax=ax)
    visualizer.fit(X, y)
    visualizer.poof(outpath=outpath)
Example No. 2
def plot_optimal_threshold(model, x_train, y_train):
    """

    """

    # Visualization Threshold
    visualizer = DiscriminationThreshold(model)

    visualizer.fit(x_train, y_train)  # Fit the data to the visualizer
    visualizer.show()
Example No. 3
 def discrimination_threshold(self) -> None:
     visualizer = DiscriminationThreshold(self.trained_model)
     visualizer.fit(self.X_test,
                    self.y_test)  # Fit the data to the visualizer
     save_dir = f"{self.plots_dir}/discrimination_plot_{self.model_id}.png"
     visualizer.show(outpath=save_dir)
     if not LOCAL:
         upload_to_s3(save_dir,
                      f'plots/discrimination_plot_{self.model_id}.png',
                      bucket=S3_BUCKET_NAME)
     plt.clf()
Example No. 4
def discrimination_threshold(model, classes, X_train, Y_train, X_test, Y_test):
    from yellowbrick.classifier import DiscriminationThreshold

    # Instantiate the classification model and visualizer

    viz = DiscriminationThreshold(model)

    # visualizer.fit(X, y)  # Fit the training data to the visualizer
    # visualizer.poof()     # Draw/show/poof the data
    viz.fit(X_train, Y_train)
    # viz.score(X_test, Y_test)
    viz.poof()
Example No. 5
def plot_discrimination_threshold(clf, data='spam', outpath=None):
    if data == 'spam':
        X, y = load_spam()
    elif data == 'churn':
        X, y = load_churn()
    else:
        raise ValueError("no dataset loader '{}'".format(data))

    _, ax = plt.subplots()

    visualizer = DiscriminationThreshold(clf, ax=ax)
    visualizer.fit(X, y)
    visualizer.poof(outpath=outpath)
Example No. 6
def selectDiscr():
    data_path = "labeled_data.csv"
    data = pd.read_csv(data_path)

    # We create the preprocessing pipelines for both numeric and categorical data.
    numeric_features = ['count_reviews', 'rating']
    numeric_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='median')), ('scaler',
                                                    StandardScaler())])

    categorical_features = ['product_category']
    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features
                       ), ('cat', categorical_transformer,
                           categorical_features)])

    viz = DiscriminationThreshold(LogisticRegression())

    clf = VisualPipeline(steps=[
        ('preprocessor', preprocessor),
        #('classifier', LogisticRegression(solver='lbfgs')),
        ('viz', viz)
    ])

    X = data.drop('label', axis=1)
    y = data['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = clf.fit(X_train, y_train)
    model.poof()
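The example above makes the DiscriminationThreshold visualizer the final step of a yellowbrick VisualPipeline. An equivalent arrangement, shown here only as a hedged sketch (it assumes the same `preprocessor`, `X_train` and `y_train` defined inside the function above), is to wrap an ordinary scikit-learn Pipeline in the visualizer so the preprocessing travels with the classifier:

# Sketch only: not part of the original example.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from yellowbrick.classifier import DiscriminationThreshold

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),                      # same ColumnTransformer as above
    ('classifier', LogisticRegression(solver='lbfgs')),
])

viz = DiscriminationThreshold(clf)                       # the visualizer wraps the whole pipeline
viz.fit(X_train, y_train)
viz.poof()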
Example No. 7
def classification(fname="classification.png"):

    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))

    # Add ClassificationReport to the left
    data = load_spam(split=True)
    oz = ClassificationReport(MultinomialNB(),
                              classes=["ham", "spam"],
                              ax=axes[0])
    oz.fit(data.X.train, data.y.train)
    oz.score(data.X.test, data.y.test)
    oz.finalize()

    # Add DiscriminationThreshold to the right
    data = load_spam(split=False)
    oz = DiscriminationThreshold(LogisticRegression(), ax=axes[1])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
Example No. 8
 def evaluate_visualizer(self, classes=None, params=None):
     LOGGER.info('Initializing plot model')
     params = params or {}  # avoid a mutable default argument
     if not os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')):
         os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))
     if classes is None:
         classes = pd.Series(self.y.values.flatten()).value_counts().index.tolist()
     visualizers = []
     for idx, (name_model, estimator) in enumerate(self.estimator.items()):
         X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
             self.X,
             self.y,
             test_size=0.2,
             stratify=self.y,
             random_state=24)
         try:
             LOGGER.info('Visualizer ClassificationReport')
             visualizer = ClassificationReport(model=estimator,
                                               classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ClassificationReport(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except Exception:
             LOGGER.warning('ERROR ClassificationReport')
         try:
             LOGGER.info('Visualizer ConfusionMatrix')
             visualizer = ConfusionMatrix(model=estimator, classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ConfusionMatrix(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except Exception:
             LOGGER.warning('ERROR ConfusionMatrix')
         try:
             LOGGER.info('Visualizer ROCAUC')
             visualizer = ROCAUC(model=estimator, classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ROCAUC(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except Exception:
             LOGGER.warning('ERROR ROCAUC')
         try:
             LOGGER.info('Visualizer PrecisionRecallCurve')
             visualizer = PrecisionRecallCurve(model=estimator,
                                               per_class=True,
                                               classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = PrecisionRecallCurve(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except Exception:
             LOGGER.warning('ERROR PrecisionRecallCurve')
         try:
             LOGGER.info('Visualizer ClassPredictionError')
             visualizer = ClassPredictionError(model=estimator,
                                               classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ClassPredictionError(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except Exception:
             LOGGER.warning('ERROR ClassPredictionError')
         try:
             LOGGER.info('Visualizer Discrimination Threshold')
             visualizer = DiscriminationThreshold(model=estimator,
                                                  classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = DiscriminationThreshold(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except Exception:
             LOGGER.warning('ERROR Discrimination Threshold')
Example No. 9
# %%
classification_report(clf, X, y)

# %%
visualizer = ROCAUC(clf, classes=class_names)
visualizer.score(X, y)
visualizer.poof()

# %%
visualizer = ClassPredictionError(clf, classes=class_names)
visualizer.score(X, y)
visualizer.poof()

# %%
visualizer = DiscriminationThreshold(clf)
visualizer.fit(X, y)
visualizer.poof()

# %%
keep = [263, 268, 287, 288, 300, 302, 307, 308, 313, 315]

# %%
seed = 15
test_size = 0.33
Xt, Xv, yt, yv = \
    sklearn.model_selection.train_test_split(
        X[keep], y, test_size=test_size, stratify=y, random_state=seed)

# %%
explainer = shap.TreeExplainer(clf)
Example No. 10
ax3 = fig3.add_subplot(111)
ax3.set_title('Score distribution')
score = np.linspace(0, 1, 1000)
ax3.plot(score, num_tp1, "-", label = "Signal")
ax3.plot(score, num_tn1, "-", label = "Background")
ax3.set_ylabel("Count")
ax3.set_xlabel("Score cut")
ax3.legend()
fig3.savefig("plots/forest/Scoredistribution.pdf")

# precision recall threshold curve
# https://www.kaggle.com/kevinarvai/fine-tuning-a-classifier-in-scikit-learn, http://www.scikit-yb.org/en/latest/api/classifier/threshold.html
from yellowbrick.classifier import DiscriminationThreshold
fig5 = plt.figure(5)
ax5 = fig5.add_subplot(111)
visualizer = DiscriminationThreshold(forest, exclude=("queue_rate", "fscore"), ax=ax5)
visualizer.fit(data_train_X, data_train_y)  # Fit the training data to the visualizer
visualizer.poof(outpath="plots/forest/precrecathresh.pdf")  # Draw/show/poof the data

print(time.perf_counter())  # time.clock() was removed in Python 3.8

print(confusion_matrix(expected, (predicted_probs[:,1] > 0.3).astype(bool)))
from sklearn.metrics import classification_report
print(classification_report(expected, (predicted_probs[:,1] > 0.3).astype(bool)))


# http://www.scikit-yb.org/en/latest/api/features/importances.html
from yellowbrick.features.importances import FeatureImportances

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
Example No. 11
def supervised_prediction(algorithm, training_x, testing_x, training_y,
                          testing_y, cols, cf, threshold_plot):

    #model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)

    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients,
                           column_df,
                           left_index=True,
                           right_index=True,
                           how="left"))
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy   Score : ", accuracy_score(testing_y, predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    #plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False,
                        colorscale="Picnic",
                        name="matrix")

    #plot roc curve
    trace2 = go.Scatter(x=fpr,
                        y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'),
                                  width=2,
                                  dash='dot'))

    #plot coeffs
    trace4 = go.Bar(x=coef_sumry["features"],
                    y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    #subplots
    fig = tls.make_subplots(
        rows=2,
        cols=2,
        specs=[[{}, {}], [{
            'colspan': 2
        }, None]],
        subplot_titles=('Confusion Matrix',
                        'Receiver operating characteristic',
                        'Feature Importances'))

    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(showlegend=False,
                         title="Model performance",
                         autosize=False,
                         height=900,
                         width=800,
                         plot_bgcolor='rgba(240,240,240, 0.95)',
                         paper_bgcolor='rgba(240,240,240, 0.95)',
                         margin=dict(b=195))
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(
        dict(showgrid=True, tickfont=dict(size=10), tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
Example No. 12
def discrimination_thresholding(xx, yy, estimatorss, **kwargs):
    vz = DiscriminationThreshold(
        estimatorss,
        classes=['Reach, 1 Reach, or L/R Reach', 'Null, Multiple Reaches, Or Multiple Arms'],
        cmap="YlGn", size=(600, 360), **kwargs)
    vz.fit(xx, yy)
    vz.score(xx, yy)
    vz.show()
Example No. 13
    visualizer = ClassPredictionError(model)
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Classification report
    visualizer = ClassificationReport(model)
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Confusion matrix
    visualizer = ConfusionMatrix(model)
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Threshold selection
    visualizer = DiscriminationThreshold(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Learning curve
    visualizer = LearningCurve(model, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Cross-validation
    visualizer = CVScores(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Feature importances
    visualizer = FeatureImportances(model)
Example No. 14
### ROC-AUC

from yellowbrick.classifier import ROCAUC

visualizer = ROCAUC(LogisticRegression(), classes=classes)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()

### Class Prediction Error

from yellowbrick.classifier import ClassPredictionError

visualizer = ClassPredictionError(LogisticRegression(), classes=classes)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()

### Discrimination Threshold

# Only works for binary classification

from yellowbrick.classifier import DiscriminationThreshold

visualizer = DiscriminationThreshold(LogisticRegression())

visualizer.fit(X, y)
visualizer.poof()
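Since DiscriminationThreshold only supports binary targets, a multiclass problem has to be reduced to one positive class versus the rest before it can be used. A minimal sketch, where `positive_class` is a placeholder assumption rather than something from the original:

# Hypothetical: pick whichever label the threshold should be tuned for.
positive_class = classes[1]
y_binary = (y == positive_class).astype(int)   # 1 = positive class, 0 = everything else

visualizer = DiscriminationThreshold(LogisticRegression())
visualizer.fit(X, y_binary)
visualizer.poof()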
Example No. 15
def train_and_evaluate_classifier(
    algorithm,
    training_x,
    testing_x,
    training_y,
    testing_y,
    cols,
    cf='coefficients',
    threshold_plot=True,
):
    """
    Обучение классификатора на тренировочных данных, оценка прогноза на тестовых,
    и визуализация некоторых метрик качества прогноза.
    algorithm     - использованный алгоритм с методами fit, predict и predict_proba
    training_x    - данные для предсказывающих переменных (обучение)
    testing_x     - данные для предсказывающих переменных (тест)
    training_y    - целевая переменная (обучение)
    training_y    - целевая переменная (тест)
    cf - ["coefficients","features"](коэффициенты для логрегрессии, параметры для деревьев)
    threshold_plot - если True, возвращает для модели threshold plot
    """

    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    # coefficients
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)
    else:
        raise ValueError(
            "cf must be one of {'coefficients', 'features'}")

    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients,
                           column_df,
                           left_index=True,
                           right_index=True,
                           how="left"))
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Отчет по классфицикации : \n",
          classification_report(testing_y, predictions))
    print("Точность : ", accuracy_score(testing_y, predictions))
    # confusion_matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Площадь под кривой : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # prepare the confusion matrix for plotting
    trace1 = go.Heatmap(
        z=conf_matrix,
        x=["Users", "Churn"],
        y=["Users", "Churn"],
        showscale=False,
        colorscale="Picnic",
        name="matrix",
    )

    # prepare the ROC curve for plotting
    trace2 = go.Scatter(x=fpr,
                        y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'),
                                  width=2,
                                  dash='dot'))

    # prepare the coefficients for plotting
    trace4 = go.Bar(x=coef_sumry["features"],
                    y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    # draw the subplots
    fig = tls.make_subplots(rows=2,
                            cols=2,
                            specs=[[{}, {}], [{
                                'colspan': 2
                            }, None]])

    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(
        showlegend=False,
        autosize=False,
        height=900,
        width=800,
        plot_bgcolor='rgba(240,240,240, 0.95)',
        paper_bgcolor='rgba(240,240,240, 0.95)',
        margin=dict(b=195),
    )
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(
        dict(showgrid=True, tickfont=dict(size=10), tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
Example No. 16
def discrimination():
    X, y = load_spam()
    oz = DiscriminationThreshold(LogisticRegression(solver="lbfgs"),
                                 ax=newfig())
    oz.fit(X, y)
    savefig(oz, "discrimination_threshold")
Example No. 17
def telecom_churn_prediction_alg(algorithm,
                                 training_x,
                                 testing_x,
                                 training_y,
                                 testing_y,
                                 threshold_plot=True):
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score   : ", accuracy_score(testing_y, predictions))
    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc)
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # plot roc curve
    trace1 = go.Scatter(
        x=fpr,
        y=tpr,
        name="Roc : " + str(model_roc_auc),
        line=dict(color=('rgb(22, 96, 167)'), width=2),
    )
    trace2 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'),
                                  width=2,
                                  dash='dot'))

    # plot confusion matrix
    trace3 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False,
                        colorscale="Blues",
                        name="matrix",
                        xaxis="x2",
                        yaxis="y2")

    layout = go.Layout(
        dict(title="Model performance",
             autosize=False,
             height=500,
             width=800,
             showlegend=False,
             plot_bgcolor="rgb(243,243,243)",
             paper_bgcolor="rgb(243,243,243)",
             xaxis=dict(title="false positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        domain=[0, 0.6],
                        ticklen=5,
                        gridwidth=2),
             yaxis=dict(title="true positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        zerolinewidth=1,
                        ticklen=5,
                        gridwidth=2),
             margin=dict(b=200),
             xaxis2=dict(domain=[0.7, 1],
                         tickangle=90,
                         gridcolor='rgb(255, 255, 255)'),
             yaxis2=dict(anchor='x2', gridcolor='rgb(255, 255, 255)')))
    data = [trace1, trace2, trace3]
    fig = go.Figure(data=data, layout=layout)

    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
Example No. 18
# Compute the accuracy: accuracy
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))

#%%
# Measuring performance
train_score2 = xgbc.score(X_train, y_train)
test_score2 = xgbc.score(X_test, y_test)
print(train_score2)
print(test_score2)
#%%
# Measuring performance
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))
print(accuracy_score(y_test, preds))

#%%
#ROC
# Instantiate the visualizer with the classification model
visualizer = ROCAUC(xgbc, classes=["will not default", "will default"])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()  

#%%
visualizer = DiscriminationThreshold(xgbc)

visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.show()
Example No. 19
LR = LogisticRegression()

LR.fit(X_train,y_train)


evaluate_model(LR,X_test,y_test,True) # evaluate model

# let's look at the discrimination threshold

pred = LR.predict(X_test)
proba = LR.predict_proba(X_test)  # the default discrimination threshold is 0.5; let's find the best one
proba = pd.DataFrame(proba,columns=["0","1"])
proba["Selected Class"] = pred

# try to find the threshold that maximizes the F1 score
vis = DiscriminationThreshold(LR)
vis.fit(X_train, y_train)
vis.poof()  # the algorithm searches for the threshold that maximizes the F1 score
# threshold = 0.29
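With the roughly 0.29 threshold reported above, class labels can be derived from the probabilities instead of the default 0.5 cut. A minimal sketch using the `proba` frame built above:

from sklearn.metrics import classification_report

best_threshold = 0.29                                        # value reported by the visualizer above
custom_pred = (proba["1"] >= best_threshold).astype(int)     # positive class if P(class 1) >= threshold
print(classification_report(y_test, custom_pred))            # compare against the default-threshold report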

# KNN
from sklearn.neighbors import KNeighborsClassifier


k_scores = {}
for k in range(1,30,2):
    
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(X_train,y_train)

    k_scores[k] = [KNN.score(X_test,y_test),roc_auc_score(y_test,KNN.predict(X_test))]
Example No. 20
def telecom_churn_prediction(algorithm,
                             name,
                             X_train,
                             X_test,
                             y_train,
                             y_test,
                             cols,
                             cf=None,
                             plot=False,
                             threshold=False):
    #model
    start = time()  # Get start time
    algorithm.fit(X_train, y_train)
    end = time()  # Get end time
    # Calculate the training time
    train_time = round(end - start, 4)

    #predict
    start = time()  # Get start time
    predictions_test = algorithm.predict(X_test)
    end = time()  # Get end time
    # Calculate the training time
    pred_time = round(end - start, 4)

    predictions_train = algorithm.predict(X_train)
    probabilities = algorithm.predict_proba(X_test)

    #coeffs
    if cf is not None:
        if cf == "coefficients":
            coefficients = pd.DataFrame(algorithm.coef_.ravel())
        elif cf == "features":
            coefficients = pd.DataFrame(algorithm.feature_importances_)

        column_df = pd.DataFrame(cols)
        coef_sumry = (pd.merge(coefficients,
                               column_df,
                               left_index=True,
                               right_index=True,
                               how="left"))
        coef_sumry.columns = ["coefficients", "features"]
        coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(y_test, predictions_test))
    #confusion matrix
    conf_matrix = confusion_matrix(y_test, predictions_test)

    #roc_auc_score
    model_roc_auc = roc_auc_score(y_test, predictions_test)
    print('train')
    print("Accuracy   Score : ", accuracy_score(y_train, predictions_train))
    print("Area under curve : ", roc_auc_score(y_train, predictions_train),
          "\n")
    print('test')
    print("Accuracy   Score :", accuracy_score(y_test, predictions_test))
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(y_test, probabilities[:, 1])

    accuracy = accuracy_score(y_test, predictions_test)
    recallscore = recall_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    roc_auc_train = roc_auc_score(y_train, predictions_train)
    roc_auc_test = roc_auc_score(y_test, predictions_test)
    f1score = f1_score(y_test, predictions_test)
    result = pd.DataFrame({
        "Model": [name],
        "Accuracy_score": [accuracy],
        "Recall_score": [recallscore],
        "Precision": [precision],
        "f1_score": [f1score],
        "Area_under_curve(train)": [roc_auc_train],
        "Area_under_curve(test)": [roc_auc_test],
        "train_time": [train_time],
        'pred_time': [pred_time]
    })
    if cf is not None:
        plt.figure(figsize=(12, 8))
        #plot confusion matrix
        plt.subplot(221)
        plt.grid(False)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])
        plt.subplot(222)
        #plot roc curve
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")
        #plot coeffs
        sns.set(font_scale=1)
        plt.subplot(212)
        plt.title('Feature Importances')
        plt.xticks(rotation='90')
        sns.barplot(x=coef_sumry['features'], y=coef_sumry['coefficients'])
        plt.subplots_adjust(top=1.2,
                            bottom=0.2,
                            left=0.10,
                            right=0.95,
                            hspace=0.25,
                            wspace=0.35)
        if threshold:
            #plot threshold
            plt.figure(figsize=(14, 4))
            visualizer = DiscriminationThreshold(algorithm)
            visualizer.fit(X_train, y_train)
            visualizer.poof()
    else:
        plt.figure(figsize=(12, 4))
        #plot confusion matrix
        plt.subplot(121)
        plt.grid(False)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])
        plt.subplot(122)
        #plot roc curve
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")
        plt.subplots_adjust(top=1.2,
                            bottom=0.2,
                            left=0.10,
                            right=0.95,
                            hspace=0.25,
                            wspace=0.35)
    return result
Example No. 21
rf_probas = rf.predict_proba(X_test)[:, 1]
plot_precision_recall(y_test, rf_probas)

from yellowbrick.classifier import PrecisionRecallCurve

viz = PrecisionRecallCurve(rf)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.poof()

# Discrimination Threshold - the probability or score at which the positive class is chosen over the negative class

from yellowbrick.classifier import DiscriminationThreshold

viz = DiscriminationThreshold(rf)
viz.fit(X_train, y_train)
viz.poof()

# Average Precision

from sklearn.metrics import average_precision_score

average_precision_score(
    y_test,
    rf.predict_proba(X_test)[:, 1])  # slice to give probs of class 1

# AUC and ROC curve

from sklearn.metrics import roc_auc_score
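The excerpt breaks off after this import; presumably it continues roughly along these lines (a sketch, not the original code):

print(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))  # AUC from the positive-class probabilities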
fig["layout"]["yaxis2"].update(dict(title = "true positive rate"))
fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),tickangle = 90))
py.iplot(fig)
    


# Usually, we assign an object to a class if the probability of belonging to that class is above 0.5.
# However, this threshold can be adjusted, and this function finds the optimal value for several metrics (recall, precision, f1, queue rate).

# Find optimal threshold  
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
# The optimal threshold is 0.30. 

# A better method to find the optimal threshold
visualizer = DiscriminationThreshold(classifier)
visualizer.fit(X_train,y_train)
visualizer.poof()
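Either threshold can then be applied to the predicted probabilities to produce hard labels. A minimal sketch (it assumes `classifier`, `X_test` and `y_test` exist in this script, since they are not shown in the excerpt):

from sklearn.metrics import classification_report, confusion_matrix

probs = classifier.predict_proba(X_test)[:, 1]
custom_preds = (probs >= optimal_threshold).astype(int)   # ~0.30 from the ROC-based search above

print(confusion_matrix(y_test, custom_preds))
print(classification_report(y_test, custom_preds))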

        


##########################################   IMPROVEMENTS 



# Implement SMOTE 
"""
# SMOTE is not appropriate as it does not deal with dummy variables. 
sm = SMOTE()
X_smote, y_smote = sm.fit_resample(X_train, y_train) 
Example No. 23
                                 refit=True,
                                 random_state=random_state,
                                 verbose=True)

grid_search.fit(X, y, **fit_params)
opt_parameters = grid_search.best_params_
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)

# In[89]:

model_performance(lgbm_clf, 'LightGBM')
scores_table(lgbm_clf, 'LightGBM')

# In[90]:

visualizer = DiscriminationThreshold(lgbm_clf)

visualizer.fit(X, y)
visualizer.poof()

# In[91]:

knn_clf = KNeighborsClassifier()

voting_clf = VotingClassifier(estimators=[('lgbm_clf', lgbm_clf),
                                          ('knn', KNeighborsClassifier())],
                              voting='soft',
                              weights=[1, 1])

params = {'knn__n_neighbors': np.arange(1, 30)}
Example No. 24
def discrimination_threshold(ax=None):
    data = load_spam(return_dataset=True)
    X, y = data.to_pandas()

    viz = DiscriminationThreshold(RandomForestClassifier(n_estimators=10), ax=ax)
    return tts_plot(viz, X, y, score=False)