Ejemplo n.º 1
0
def k_nearest_neighbors(json_data):
    """Train and evaluate a 5-nearest-neighbours classifier.

    ``json_data`` is loaded into a DataFrame. The ``type`` column is used as
    the target and every other column as a feature. 5% of the rows are held
    out for testing (``random_state=0``), the features are standardised, and
    a classification report is printed followed by a confusion-matrix plot.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts; must yield a
            ``type`` column.
    """
    frame = pd.DataFrame(data=json_data)
    features = frame.drop('type', axis=1)
    target = frame['type']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.05, random_state=0)

    # The scaler learns its statistics from the training split only and is
    # then applied to both splits.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    report = classification_report(y_test, y_pred)
    print('K-nearest neighbors classifier:')
    print(report)

    plot_confusion_matrix(y_true=y_test,
                          y_pred=y_pred,
                          classes=target.unique(),
                          title='K-nearest neighbors classifier')
Ejemplo n.º 2
0
def linear_svc_classifier(json_data):
    """Train and evaluate a ``LinearSVC`` on ``json_data``.

    The ``type`` column of the resulting DataFrame is the target; all other
    columns are features. 5% of the rows are held out for testing
    (``random_state=0``). Prints a classification report and draws a
    confusion-matrix plot.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts; must yield a
            ``type`` column.
    """
    frame = pd.DataFrame(data=json_data)
    features = frame.drop('type', axis=1)
    target = frame['type']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.05, random_state=0)

    # All estimator settings are spelled out explicitly in one place.
    svc_settings = {
        'C': 1.0,
        'class_weight': None,
        'dual': False,
        'fit_intercept': True,
        'intercept_scaling': 1,
        'loss': 'squared_hinge',
        'max_iter': 2500,
        'multi_class': 'ovr',
        'penalty': 'l2',
        'random_state': 0,
        'tol': 1e-05,
        'verbose': 0,
    }
    classifier = LinearSVC(**svc_settings)
    classifier.fit(x_train, y_train)

    y_pred = classifier.predict(x_test)

    report = classification_report(y_test, y_pred)
    print('Linear SVC classifier:')
    print(report)

    plot_confusion_matrix(y_true=y_test,
                          y_pred=y_pred,
                          classes=target.unique(),
                          title='Linear SVM classifier')
Ejemplo n.º 3
0
def plot_conf_matrix(cnf_matrix):
    """Display ``cnf_matrix`` twice: as given, then with ``normalize=True``.

    Each variant is drawn on its own new figure through
    ``h.plot_confusion_matrix`` using the module-level ``output_class`` as
    the class labels. ``plt.show()`` is called once, after both figures
    exist. NumPy's global print precision is set to 2 as a side effect.

    Args:
        cnf_matrix: The confusion matrix handed to ``h.plot_confusion_matrix``.
    """
    np.set_printoptions(precision=2)

    # Extra keyword arguments for each of the two figures, in drawing order.
    variants = (
        {'title': 'Confusion matrix, without normalization'},
        {'normalize': True, 'title': 'Normalized Confusion matrix'},
    )
    for extra_kwargs in variants:
        plt.figure()
        h.plot_confusion_matrix(cnf_matrix, classes=output_class,
                                **extra_kwargs)

    plt.show()
Ejemplo n.º 4
0
def create_model(X,
                 y,
                 test_size=0.25,
                 classifier_obj=GaussianNB,
                 show_plot=False,
                 save_model=False,
                 labels=(1, 2, 3, 4)):
    """Train a classifier on a train/test split and report its performance.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Target values passed to ``train_test_split``.
        test_size: Fraction (or count) of samples held out for testing.
        classifier_obj: Zero-argument callable returning an estimator with
            ``fit`` and ``predict`` (``GaussianNB`` by default).
        show_plot: When true, plot the confusion matrix via
            ``helpers.plot_confusion_matrix``.
        save_model: When true, pickle the fitted model to
            ``model-<UTC timestamp>.sav`` in the current directory.
        labels: Class labels, in order, used to index the confusion matrix
            and to annotate the plot. Defaults to ``(1, 2, 3, 4)``.

    Returns:
        A list ``[model, y_pred, cm, accuracy]``.
    """
    import pickle
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score

    # Downstream consumers receive a plain list, independent of the caller's
    # container type.
    labels = list(labels)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=0)

    # Train the selected classifier on the training set
    model = classifier_obj()
    model.fit(X_train, y_train)

    # Predict the test set results and compute the confusion matrix
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = accuracy_score(y_test, y_pred)

    # Plot confusion matrix
    if show_plot:
        from helpers import plot_confusion_matrix
        plot_confusion_matrix(cm, classes=labels)

    if save_model:
        from datetime import datetime
        dt = datetime.utcnow().strftime('%Y-%m-%d_%H-%M-%S')
        filename = 'model-{}.sav'.format(dt)
        # The context manager closes the file even if pickling raises.
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

    print('====== RESULTS FOR MODEL ======')
    print('y_test:\t{}'.format(y_test))
    print('y_pred:\t{}'.format(y_pred))
    print('accuracy\t{}'.format(accuracy))
    print('confusion matrix:\n{}'.format(cm))
    print('====== END OF RESULTS =========')

    return [model, y_pred, cm, accuracy]
Ejemplo n.º 5
0
def sgd_classifier(json_data):
    """Train and evaluate an ``SGDClassifier`` on ``json_data``.

    The ``type`` column of the resulting DataFrame is the target; all other
    columns are features. 5% of the rows are held out for testing
    (``random_state=0``). Prints a classification report and draws a
    confusion-matrix plot.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts; must yield a
            ``type`` column.
    """
    frame = pd.DataFrame(data=json_data)
    features = frame.drop('type', axis=1)
    target = frame['type']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.05, random_state=0)

    # NOTE(review): newer scikit-learn releases appear to have renamed
    # loss='log' to 'log_loss' -- confirm against the installed version.
    sgd_settings = {
        'alpha': 0.0001,
        'average': False,
        'class_weight': None,
        'early_stopping': False,
        'epsilon': 0.1,
        'eta0': 0.0,
        'fit_intercept': True,
        'l1_ratio': 0.5,
        'learning_rate': 'optimal',
        'loss': 'log',
        'max_iter': 2500,
        'n_iter_no_change': 5,
        'penalty': 'l2',
        'power_t': 0.9,
        'random_state': None,
        'shuffle': False,
        'tol': 0.00001,
        'validation_fraction': 0.1,
        'verbose': 0,
        'warm_start': False,
    }
    classifier = SGDClassifier(**sgd_settings)
    classifier.fit(x_train, y_train)

    y_pred = classifier.predict(x_test)

    report = classification_report(y_test, y_pred)
    print('SGD Classifier:')
    print(report)

    plot_confusion_matrix(y_true=y_test,
                          y_pred=y_pred,
                          classes=target.unique(),
                          title='SGD Classifier')
Ejemplo n.º 6
0
def decision_tree_classifier(json_data):
    """Train and evaluate a default ``DecisionTreeClassifier``.

    The ``type`` column of the resulting DataFrame is the target; all other
    columns are features. 5% of the rows are held out for testing
    (``random_state=0``). Prints a classification report and draws a
    confusion-matrix plot.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts; must yield a
            ``type`` column.
    """
    frame = pd.DataFrame(data=json_data)
    features = frame.drop('type', axis=1)
    target = frame['type']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.05, random_state=0)

    tree = DecisionTreeClassifier()
    tree.fit(x_train, y_train)
    y_pred = tree.predict(x_test)

    report = classification_report(y_test, y_pred)
    print('Decision tree classifier:')
    print(report)

    plot_confusion_matrix(y_true=y_test,
                          y_pred=y_pred,
                          classes=target.unique(),
                          title='Decision tree classifier')
Ejemplo n.º 7
0
def svm_classifier(json_data):
    """Train and evaluate an ``SVC`` (``gamma='scale'``, one-vs-one).

    The ``type`` column of the resulting DataFrame is the target; all other
    columns are features. 5% of the rows are held out for testing
    (``random_state=0``). Prints a classification report and draws a
    confusion-matrix plot.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts; must yield a
            ``type`` column.
    """
    classifier = SVC(gamma='scale', decision_function_shape='ovo')

    frame = pd.DataFrame(data=json_data)
    features = frame.drop('type', axis=1)
    target = frame['type']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.05, random_state=0)

    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    report = classification_report(y_test, y_pred)
    print('SVM classifier:')
    print(report)

    plot_confusion_matrix(y_true=y_test,
                          y_pred=y_pred,
                          classes=target.unique(),
                          title='SVM classifier')
Ejemplo n.º 8
0
def gaussian_naive_bayes(json_data):
    """Train and evaluate a ``GaussianNB`` classifier on ``json_data``.

    The ``type`` column of the resulting DataFrame is the target; all other
    columns are features. 5% of the rows are held out for testing
    (``random_state=0``). Prints a classification report and draws a
    confusion-matrix plot.

    Args:
        json_data: Anything ``pd.DataFrame(data=...)`` accepts; must yield a
            ``type`` column.
    """
    df = pd.DataFrame(data=json_data)

    x = df.drop('type', axis=1)
    y = df['type']

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.05,
                                                        random_state=0)

    classifier = GaussianNB()
    classifier.fit(x_train, y_train)

    y_pred = classifier.predict(x_test)

    print('Gaussian Naive Bayes Classifier:')
    print(classification_report(y_test, y_pred))
    plot_confusion_matrix(y_true=y_test,
                          y_pred=y_pred,
                          classes=y.unique(),
                          title='Gaussian Naive Bayes classifier')
Ejemplo n.º 9
0
        print("Classification Report With Dummy (n = {0}):\n{1}".format(
            dummy_n, dummy_metrics['classification_report']))
        print("Train Accuracy With Mapping (n = {0}):{1}".format(
            mapping_n, mapping_metrics['accuracy']['train']))
        print("Test Accuracy With Mapping (n = {0}):{1}".format(
            mapping_n, mapping_metrics['accuracy']['test']))
        print("Confusion Matrix With Mapping (n = {0}):\n{1}".format(
            mapping_n, mapping_metrics['confusion_matrix']))
        print("Classification Report With Mapping (n = {0}):\n{1}".format(
            mapping_n, mapping_metrics['classification_report']))
        plt.figure()
        plt.plot(dummy_costs, label='dummy')
        plt.plot(mapping_costs, label='mapping')
        plt.legend()
        plt.figure()
        h.plot_confusion_matrix(dummy_metrics['confusion_matrix'],
                                "Confusion Matrix With Dummy")
        plt.figure()
        h.plot_confusion_matrix(mapping_metrics['confusion_matrix'],
                                "Confusion Matrix With Mapping")

    if dummy_metrics['accuracy']['test'] > mapping_metrics['accuracy']['test']:
        print("Predicting with dummy variables")
        _, df = h.clean_with_dummy(args[1] if len(args) > 1 else 'test.csv',
                                   dummy_params)
        df = df.drop(['Survived'], axis=1)
        df = h.predict_knn(df, dummy_knn)
    else:
        print("Predicting with mapping")
        _, df = h.clean_with_mapping(args[1] if len(args) > 1 else 'test.csv',
                                     mapping_params)
        df = h.predict_knn(df, mapping_knn)
        # plot log loss
        fig, ax = plt.subplots()
        ax.plot(x_axis, train_metrics['mlogloss'], label='Train')
        ax.plot(x_axis, test_metrics['mlogloss'], label='Test')
        ax.legend()
        plt.ylabel('Log Loss')
        plt.title('{} - Log Loss'.format(args.model))
        plt.savefig("img/logloss_{}.png".format(uid))
        plt.show()
        # plot classification error
        fig, ax = plt.subplots()
        ax.plot(x_axis, train_metrics['merror'], label='Train')
        ax.plot(x_axis, test_metrics['merror'], label='Test')
        ax.legend()
        plt.ylabel('Error')
        plt.title('{} - Error'.format(args.model))
        plt.savefig("img/error_{}.png".format(uid))
        plt.show()

    # Confusion matrix
    plot_confusion_matrix(eval_predicted, eval_labels, args.model, uid)

    # AUC ROC scores
    compute_AUC_scores(eval_predicted_proba, eval_labels)

# Save Kaggle submission files
construct_kaggle_submissions(uid, results, results_proba)

# Persist the trained model. The context manager flushes and closes the file
# even if pickling raises.
with open("models/{}.mdl".format(uid), "wb") as model_file:
    pickle.dump(model, model_file)
Ejemplo n.º 11
0
# plot_model(model, to_file=MODEL_SAVE_PATH, show_shapes=True, rankdir='LR')
# ===========================================================================
# Training the networks
# ===========================================================================
# Loss selection: categorical cross-entropy is used below. Alternatives noted
# by the author: 'binary_crossentropy' for 2 classes, or mean squared error.
model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])

print('Train...')
# Fit on the training split; the validation split is passed as
# validation_data.
model.fit(X_train, y_train,
          batch_size=BATCH_SIZE, epochs=NUM_EPOCH,
          validation_data=(X_valid, y_valid))
# ====== evaluation ====== #
# Predict with twice the training batch size.
y_test_pred_proba = model.predict_proba(X_test, batch_size=BATCH_SIZE * 2)
# Reduce both targets and predictions to class indices along the last axis.
# NOTE(review): presumably y_test is one-hot encoded at this point -- confirm.
# y_test is overwritten here, so the original encoding is no longer available
# afterwards.
y_test = np.argmax(y_test, axis=-1)
y_test_pred = np.argmax(y_test_pred_proba, axis=-1)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))
print("F1 score:", f1_score(y_true=y_test, y_pred=y_test_pred,
                            average='micro', labels=digit.numbers))
print(classification_report(y_true=y_test, y_pred=y_test_pred, labels=digit.numbers))
# ====== plot confusion matrix ====== #
# Draw the confusion matrix on a fresh figure, with digit.numbers as labels.
plt.figure()
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=digit.numbers)
plot_confusion_matrix(cm=cm, labels=digit.numbers, colorbar=True, fontsize=6)
# ===========================================================================
# Cleaning
# ===========================================================================
# NOTE(review): plot_save is a project helper; presumably it writes the open
# figures to FIGURE_SAVE_PATH -- confirm.
plot_save(path=FIGURE_SAVE_PATH)
Ejemplo n.º 12
0
from sklearn.model_selection import cross_val_score

import helpers
from MlcLinReg import MlcLinReg

# Load the dataset through the project helper; the meaning of the argument 2
# is defined by helpers.load_delicious.
X_train, y_train, X_test, y_test = helpers.load_delicious(2)

# Plot a confusion matrix for a freshly configured estimator on the dense
# training data.
helpers.plot_confusion_matrix(MlcLinReg(learning_rate=0.5,
                                        iterations=2000,
                                        batch_size=1,
                                        l_one=0.1),
                              X=X_train.toarray(),
                              y=y_train.toarray())

# Mean score over 20-fold cross-validation with an identically configured
# estimator. The parenthesised print form is valid on Python 2 and Python 3.
print(cross_val_score(estimator=MlcLinReg(learning_rate=0.5,
                                          iterations=2000,
                                          batch_size=1,
                                          l_one=0.1),
                      X=X_train.toarray(),
                      y=y_train.toarray(),
                      cv=20).mean())
Ejemplo n.º 13
0
# Count how many predictions agree with the ground truth.
total = 0
okays = 0

for i in range(y_pred.shape[0]):
    total += 1
    if (y_pred[i] == Y_test[i]):
        okays += 1

print("total acc: ", 100 * okays / total)
print("correct classifications: ", okays)
print("errors: ", total - okays)

#-----------------------------------------------------------------------------
# plot confusion matrix

# Rows are indexed by the true label, columns by the predicted label.
confusionMatrix = np.zeros((num_classes, num_classes), dtype='int')

# Iterate over every test sample, including the last one, so the matrix
# entries sum to the number of test samples.
for i in range(Y_test.shape[0]):
    confusionMatrix[int(Y_test[i])][y_pred[i]] += 1

# NOTE: pickle executes arbitrary code on load; only read trusted files.
with open('dictLabels.p', "rb") as labels_file:
    dict_labels = pickle.load(labels_file)

# Human-readable class names in class-index order.
names = [dict_labels[i] for i in range(num_classes)]

plot_confusion_matrix(confusionMatrix, normalize=False, target_names=names)

# Persist the matrix; the context manager closes the file even on error.
with open(conf_mat_patches_saved, "wb") as matrix_file:
    pickle.dump(confusionMatrix, matrix_file)
Ejemplo n.º 14
0
accuracy_scores = cross_val_score(model,
                                  train_data,
                                  train_labels,
                                  cv=kfold,
                                  scoring="accuracy")
# NOTE(review): `scores` is computed before this section; presumably it holds
# the log-loss cross-validation scores -- confirm.
print("Cross validation logloss scores: {:.5f} {:.5f}".format(
    np.mean(scores), np.std(scores)))
print("Cross validation accuracy scores: {:.5f} {:.5f}".format(
    np.mean(accuracy_scores), np.std(accuracy_scores)))

# Shared identifier for the submission files and the pickled model.
run_name = ensemble['uid'] + "_Final=" + str(args.final)

# Save Kaggle submission files
results = model.predict(test_data)  # Predicts from 1-10
results_proba = model.predict_proba(test_data)
construct_kaggle_submissions(run_name, results, results_proba)

# Persist the trained model; the context manager closes the file even if
# pickling raises.
with open("models/{}.mdl".format(run_name), "wb") as model_file:
    pickle.dump(model, model_file)

if not args.final:
    # Confusion matrix
    eval_predicted = model.predict(eval_data)
    plot_confusion_matrix(eval_predicted, eval_labels, ensemble['title'],
                          ensemble['uid'])

    # AUC ROC scores
    eval_predicted_proba = model.predict_proba(eval_data)
    compute_AUC_scores(eval_predicted_proba, eval_labels)
Ejemplo n.º 15
0
# TEST MODEL ===========================================================================================================
    all_predictions = model.predict(all_gen)
    test_predictions = model.predict(test_gen)

    all_node_predictions = target_encoding.inverse_transform(all_predictions.squeeze())
    test_node_predictions = target_encoding.inverse_transform(test_predictions.squeeze())

    all_predictions_df = pd.DataFrame({"Predicted": all_node_predictions,
                                       "True": graph_labels,
                                       "Fraud IDs": fraud_ids}, index=graph_labels.index)
    all_predictions_df['is_correct'] = all_predictions_df['Predicted'] == all_predictions_df['True']
    all_predictions_df.to_csv(cfg.STORAGE_ROOT_PATH + rf'\results_all_gcn_node.csv', sep=';')

    all_identified_cases = all_predictions_df[all_predictions_df['is_correct'] == True]['Fraud IDs']
    print('IDENTIFIED IN ALL: ', functools.reduce(aggregate_sets, all_identified_cases.dropna(), set()))

    test_predictions_df = pd.DataFrame({"Predicted": test_node_predictions,
                                        "True": target_encoding.inverse_transform(test_targets.squeeze()),
                                        "Fraud IDs": test_fraud_ids}, index=test_subjects.index)
    test_predictions_df['is_correct'] = test_predictions_df['Predicted'] == test_predictions_df['True']
    test_predictions_df.to_csv(cfg.STORAGE_ROOT_PATH + rf'\results_test_gcn_node.csv', sep=';')

    test_identified_cases = test_predictions_df[test_predictions_df['is_correct'] == True]['Fraud IDs']
    print('IDENTIFIED IN ALL: ', functools.reduce(aggregate_sets, test_identified_cases.dropna(), set()))

    plot_confusion_matrix('Confusion Matrix - All Data', all_node_predictions, graph_labels.tolist(),
                          cfg.STORAGE_BASE_THESIS_IMG + rf'\conf_matrix_all_gcn_node.pdf')
    plot_confusion_matrix('Confusion Matrix - Test Data', test_node_predictions, target_encoding.inverse_transform(test_targets.squeeze()),
                          cfg.STORAGE_BASE_THESIS_IMG + rf'\conf_matrix_test_gcn_node.pdf')