def evaluate_partitions(keep_bin_edges, df_processed):
    """Score a lightweight ComplementNB classifier at every candidate bin edge.

    Inputs are a list of bin edges for the continuous target and the
    processed df. Each bin edge is used as the threshold handed to
    ``partition_data``; for that partition ``num_trials`` independent
    split / fit / evaluate rounds are run.

    Returns a DataFrame with one row per (bin edge, trial) pair holding the
    data percentile, the threshold, and accuracy, Matthews correlation
    coefficient, ROC-AUC and average precision for both the test set and
    the control set.
    """
    # binning parameters fixed - DO NOT CHANGE
    num_bins = 10
    num_trials = 10

    # One list per output column; dict insertion order is the column order.
    columns = {
        'data percentile': [],
        'threshold': [],
        'test accuracy': [],
        'test mcc': [],
        'test auc': [],
        'test ap': [],
        'control accuracy': [],
        'control mcc': [],
        'control auc': [],
        'control ap': [],
    }

    # Running data percentile: grows by one bin width per bin edge visited.
    pct = 0.0
    for threshold in keep_bin_edges:
        # feature matrices (regular and control) plus labels for this threshold
        X, X_control, y = partition_data(df_processed, threshold)
        pct += 1 / num_bins

        for _ in range(num_trials):
            # fresh train / test / control split for every trial
            (x_train_idf, y_train,
             x_test_idf, y_test,
             x_control_idf) = split_transform_data(X, X_control, y)

            clf = ComplementNB(
                alpha=0.1, class_prior=None, fit_prior=True, norm=False)
            clf.fit(x_train_idf, y_train)

            # The control set is always scored against the full label vector y.
            test_acc = clf.score(x_test_idf, y_test)
            control_acc = clf.score(x_control_idf, y)

            test_labels = clf.predict(x_test_idf)
            control_labels = clf.predict(x_control_idf)
            test_mcc = mcc(y_test, test_labels)
            control_mcc = mcc(y, control_labels)

            # Column 1 of predict_proba feeds the ranking metrics below.
            test_proba = clf.predict_proba(x_test_idf)
            control_proba = clf.predict_proba(x_control_idf)
            test_pos = test_proba[:, 1]
            control_pos = control_proba[:, 1]

            test_auc = roc_auc_score(y_test, test_pos)
            control_auc = roc_auc_score(y, control_pos)
            test_ap = apscore(y_test, test_pos)
            control_ap = apscore(y, control_pos)

            trial_row = {
                'data percentile': pct,
                'threshold': threshold,
                'test accuracy': test_acc,
                'test mcc': test_mcc,
                'test auc': test_auc,
                'test ap': test_ap,
                'control accuracy': control_acc,
                'control mcc': control_mcc,
                'control auc': control_auc,
                'control ap': control_ap,
            }
            for column_name, value in trial_row.items():
                columns[column_name].append(value)

    # populate into a df for downstream analysis, one column at a time
    df_eval = pd.DataFrame()
    for column_name, values in columns.items():
        df_eval[column_name] = values

    return df_eval
# Train `clf_log` (created outside this section; presumably a logistic
# regression model, judging by its name) on the training vectors.
clf_log.fit(train_x_vectors, y_train)

# Predict on the held-out vectors. The score and confusion-matrix results
# below are not stored or printed, so they are only visible when this runs
# in an interactive session / notebook.
y_pred = clf_log.predict(test_x_vectors)
clf_log.score(test_x_vectors, y_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
# Using Naive Bayes: ComplementNB


from  sklearn.naive_bayes import ComplementNB
# Create the classifier with default parameters, then fit and predict in one
# chained call.
clf_compnb = ComplementNB()
y_pred2 = clf_compnb.fit(train_x_vectors, y_train).predict(test_x_vectors)
# As above, these two results are computed but discarded.
confusion_matrix(y_test,y_pred2)
clf_compnb.score(test_x_vectors, y_test)














Example #3
0
# Hold out 20% of the samples for testing; stratifying on the genre column
# keeps the genre proportions the same in the train and test splits.
# `word_vec` is the feature matrix built outside this section.
X_train, X_test, y_train, y_test = train_test_split(
    word_vec,
    lyrics_sub['genre'],
    test_size=0.20,
    stratify=lyrics_sub['genre'])

print("At CNB")
# Create the Complement Naive Bayes model with default parameters and fit it.
clf = ComplementNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print("CNB Results")
# Score the model: estimator score, confusion matrix, per-class report.
print(clf.score(X_test, y_test))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

print("At XGB")
# Create the XGBoost classifier with default parameters and fit it on the
# same split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
# NOTE: `pred` is rebound here; the CNB predictions are no longer available.
pred = xgb.predict(X_test)

print("XGB RESULTS")
# Score the model with the same three reports as above.
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
Example #4
0
 #clf = RandomForestClassifier(verbose=0, random_state=42, n_estimators=100)

 # Fit whichever classifier `clf` was bound to before this fragment.
 clf.fit(x, y)

 # Ground truth per test document: 1 when `uid` appears in that document's
 # solution entry, 0 otherwise.
 test_y = np.array(
     [1 if uid in solution[doc] else 0 for doc in test_freqs.keys()])

 predictions = clf.predict(test_x)

 # Result is not stored; only visible in an interactive session.
 clf.score(test_x, test_y)

 # Tally the four confusion-matrix cells by walking predictions and truth
 # in lockstep.
 true_pos = 0
 false_pos = 0
 false_neg = 0
 true_neg = 0

 for predicted, actual in zip(predictions, test_y):
     if predicted == 1 and actual == 1:
         true_pos += 1
     elif predicted == 0 and actual == 1:
         false_neg += 1
     elif predicted == 1 and actual == 0:
         false_pos += 1
     elif predicted == 0 and actual == 0:
         true_neg += 1
    # keep the knn, it's the best
    # Fit a default k-nearest-neighbours classifier, predict on the test
    # features and write the fitted model to 'knn.joblib'
    # (`dump` is presumably joblib.dump — confirm against the imports).
    knn = KNeighborsClassifier()
    knn.fit(train_data_features, y_train)
    knn_preds = knn.predict(test_data_features)
    dump(knn, 'knn.joblib')

    # Same fit/predict cycle with Complement Naive Bayes (not persisted).
    cnb = ComplementNB()
    cnb.fit(train_data_features, y_train)
    cnb_preds = cnb.predict(test_data_features)

    # make df with all preds: one row per test document with the three
    # models' predictions, the true category and the document itself.
    # `lr_preds` and `x_test` come from code outside this fragment.
    df = pd.DataFrame(
        list(zip(cnb_preds, lr_preds, knn_preds, y_test, x_test)),
        columns=['cnb_preds', 'lr_preds', 'knn_preds', 'category', 'document'])

    # save incorrect predictions in a df to look at (one frame per model)
    lr_incorrect = df[df['lr_preds'] != df['category']].copy()
    knn_incorrect = df[df['knn_preds'] != df['category']].copy()
    cnb_incorrect = df[df['cnb_preds'] != df['category']].copy()

    # combine lr and knn incorrects: rows that both knn and lr got wrong ...
    two_incorrect = knn_incorrect[
        knn_incorrect['lr_preds'] != knn_incorrect['category']].copy()
    # ... narrowed further to rows that cnb also got wrong (all three wrong).
    all_incorrect = two_incorrect[
        two_incorrect['cnb_preds'] != two_incorrect['category']].copy()

    # Report each model's score on the test set; `log_reg` is fitted outside
    # this fragment.
    print('knn score: ', knn.score(test_data_features, y_test))
    print('log_reg score: ', log_reg.score(test_data_features, y_test))
    print('ComplementNaiveBayes score: ', cnb.score(test_data_features,
                                                    y_test))
# The last column of `data` is the target; every other column is a feature.
Y = numpy.asarray(data[data.columns[-1]])
X = numpy.asarray(data[data.columns[:-1]])

# Candidate models: a depth-limited decision tree and three Naive Bayes
# variants, all otherwise at default settings.
clf = tree.DecisionTreeClassifier(max_depth=4)
GNB = GaussianNB()
MNB = MultinomialNB()
CNB = ComplementNB()

# Identical evaluation for each model, in order: print its label, print the
# 5-fold cross-validation scores, then refit on the full data and print the
# score on that same (training) data.
for _label, _estimator in (('clf', clf), ('GNB', GNB),
                           ('MNB', MNB), ('CNB', CNB)):
    print(_label)
    scores = cross_val_score(_estimator, X, Y, cv=5)
    print(scores)
    _estimator.fit(X, Y)
    print(_estimator.score(X, Y))
# Model Accuracy, how often is the classifier correct?
#%%  Naive Bayes
# Fit three Naive Bayes variants with default parameters on the same
# training split so their test scores can be compared side by side.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train,y_train) 

from sklearn.naive_bayes import MultinomialNB
clf1 = MultinomialNB()
clf1.fit(x_train,y_train)

from sklearn.naive_bayes import ComplementNB
clf2 = ComplementNB()
clf2.fit(x_train,y_train)

# Print the test-set score of each variant, one per line.
print("\n","GaussianNB:",nb.score(x_test,y_test),"\n","MultinomialNB:",clf1.score(x_test,y_test),"\n","ComplementNB:",clf2.score(x_test,y_test))
# GaussianNB was chosen as the most suitable model because its accuracy is the highest.
# NOTE: the next two lines run the same prediction twice; only `y_prednb` is
# used in the rest of this cell.
predictionnb = nb.predict(x_test)
y_prednb = nb.predict(x_test)

# Detailed report for the chosen GaussianNB model.
# `metrics`, `confusion_matrix` and `classification_report` are not imported
# in this cell; they must already be in scope.
print("Accuracy:",metrics.accuracy_score(y_test, y_prednb))
print( confusion_matrix(y_test,y_prednb))
print("GaussianNB")
print(classification_report(y_test,y_prednb))
#%%  Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Entropy-based tree with no depth limit; a node needs at least 10 samples to
# be split and 18 features are considered per split. random_state=0 makes the
# fit reproducible.
dt = DecisionTreeClassifier(criterion="entropy", max_depth=None,min_samples_split=10,max_features=18,random_state=0)
dt = dt.fit(x_train,y_train)
# NOTE: the same prediction is computed twice; only `y_preddt` is used below.
predictiondt = dt.predict(x_test)
y_preddt = dt.predict(x_test)
# `metrics` is not imported in this cell; it must already be in scope.
print("Accuracy:",metrics.accuracy_score(y_test, y_preddt))