import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Project-local helpers assumed by the functions below: data loading/preprocessing,
# train/val/test splitting, metric display and report generation.
import preprocess
import utils


def random_forest(sampling=False, isNotebook=False):
    print("=" * 60)
    print("Running Random Forest...")
    DATA_FILE = utils.get_data_directory()

    # The sampling argument determines whether we apply oversampling or not
    if sampling:
        process_method = preprocess.oversample(DATA_FILE)
    else:
        process_method = preprocess.preprocess_data(DATA_FILE)

    X, y = process_method
    X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.6)
    X_val, X_test, y_val, y_test = utils.split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'n_estimators': [100, 500, 1000],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 4, 5, 10, 13],
        'min_samples_leaf': [1, 2, 5, 8, 13]
    }

    clf = GridSearchCV(RandomForestClassifier(random_state=0),
                       param_grid,
                       cv=ps)

    model = clf.fit(X_grid, y_grid)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test,
                                        model.predict(X_test),
                                        output_dict=True,
                                        target_names=["No", "Yes"])

    feature_importances = model.best_estimator_.feature_importances_
    top_feature_importances = list(
        sorted(enumerate(feature_importances),
               key=lambda x: x[1],
               reverse=True))

    if isNotebook:
        return top_feature_importances, model
    else:
        utils.display_metrics(report_dict)

    utils.log_results(top_feature_importances)
    utils.generate_report("Random Forest", "Random Forest", model, X_test,
                          y_test, report_dict)
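

# Every function in this file builds a PredefinedSplit from [-1, ..., 0, ...] labels so that
# GridSearchCV scores each hyperparameter candidate on one fixed train/validation split
# instead of k-fold CV. The demo below is an illustrative sketch (synthetic labels only, not
# part of the original pipeline): samples labelled -1 never land in a test fold, and the
# samples labelled 0 form the single validation fold.
def _predefined_split_demo():
    demo_labels = [-1] * 6 + [0] * 4        # first 6 samples: train only; last 4: validation
    demo_split = PredefinedSplit(demo_labels)
    print(demo_split.get_n_splits())        # 1 -> exactly one predefined split
    for train_idx, val_idx in demo_split.split():
        print(train_idx)                    # [0 1 2 3 4 5]
        print(val_idx)                      # [6 7 8 9]

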
def neural_network(sampling=False, isNotebook=False):
    print("=" * 60)
    print("Neural network...")
    DATA_FILE = utils.get_data_directory()

    # The sampling argument determines whether we apply oversampling or not
    if sampling:
        process_method = preprocess.oversample(DATA_FILE)
    else:
        process_method = preprocess.preprocess_data(DATA_FILE)

    X, y = process_method
    X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.6)
    X_val, X_test, y_val, y_test = utils.split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'activation': ['logistic', 'identity', 'tanh', 'relu'],
        'hidden_layer_sizes': [(100,), (10, 20, 10, 20, 10, 20, 10)],
        'solver': ['adam', 'sgd'],
    }

    clf = GridSearchCV(MLPClassifier(random_state=0), param_grid, cv=ps)

    model = clf.fit(X_grid, y_grid)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test,
                                        model.predict(X_test),
                                        output_dict=True,
                                        target_names=["No", "Yes"])

    imps = permutation_importance(model, X_test, y_test)
    top_feature_importances = list(
        sorted(enumerate(imps.importances_mean),
               key=lambda x: x[1],
               reverse=True))

    if isNotebook:
        return top_feature_importances, model
    else:
        utils.display_metrics(report_dict)

    utils.log_results(top_feature_importances)
    utils.generate_report("Neural Network", "MLP", model, X_test, y_test,
                          report_dict)
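

# The searches in this file only call model.score and model.best_estimator_ on the fitted
# GridSearchCV. As an illustrative aside on synthetic data (not the project's dataset), the
# sketch below shows the other attributes a fitted search exposes: best_params_ for the
# winning hyperparameters and best_score_ for their mean validation score.
def _grid_search_inspection_demo():
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(60, 4))
    y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(int)

    search = GridSearchCV(MLPClassifier(random_state=0, max_iter=500),
                          {'hidden_layer_sizes': [(10,), (20, 10)]},
                          cv=3)
    search.fit(X_demo, y_demo)
    print(search.best_params_)              # hyperparameters of the winning candidate
    print(round(search.best_score_, 3))     # mean cross-validated accuracy of that candidate
    return search.best_estimator_           # refit on all of X_demo, ready for predictions

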
def support_vector_machine(sampling=False, isNotebook=False):
    print("=" * 60)
    print("Running support vector machine...")
    DATA_FILE = utils.get_data_directory()

    # The sampling argument determines whether we apply oversampling or not
    if sampling:
        process_method = preprocess.oversample(DATA_FILE)
    else:
        process_method = preprocess.preprocess_data(DATA_FILE)

    X, y = process_method
    X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.6)
    X_val, X_test, y_val, y_test = utils.split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)
    
    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': ['rbf', 'poly']
    }

    clf = GridSearchCV(SVC(random_state=0, probability=True), param_grid, cv=ps)

    model = clf.fit(X_grid, y_grid)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test,
                                        model.predict(X_test),
                                        output_dict=True,
                                        target_names=["No", "Yes"])

    weights = permutation_importance(model, X_test, y_test)
    top_weights = list(
        sorted(enumerate(weights.importances_mean),
               key=lambda x: x[1],
               reverse=True))

    if isNotebook:
        return top_weights, model
    else:
        utils.display_metrics(report_dict)

    utils.log_results(top_weights)
    utils.generate_report("SVM", "SVM", model, X_test, y_test, report_dict)
def svm_exp():
    print("=" * 60)
    print("Running experiement on SVM...")
    TRAIN_SET = utils.get_data_directory()
    TEST_SET = utils.get_data_directory(fileName="/experiment-dataset.csv")

    X, y = preprocess.oversample(TRAIN_SET)
    X = np.delete(X, slice(4, 13), 1)
    X_train, X_val, y_train, y_val = utils.split_data(X, y, 0.8)
    X_test, y_test = preprocess.preprocess_experiment(TEST_SET)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': ['rbf', 'poly']
    }

    print(X_train.shape)
    clf = GridSearchCV(SVC(random_state=0), param_grid, cv=ps)

    model = clf.fit(X_grid, y_grid)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test,
                                        model.predict(X_test),
                                        output_dict=True,
                                        target_names=["No", "Yes"])
    utils.display_metrics(report_dict)

    imps = permutation_importance(model, X_test, y_test)
    top_feature_importances = list(
        sorted(enumerate(imps.importances_mean),
               key=lambda x: x[1],
               reverse=True))
    utils.log_results(top_feature_importances)
    utils.generate_report("Experiment SVM", "Experimental SVM", model, X_test,
                          y_test, report_dict)
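

# svm_exp drops a block of columns with np.delete(X, slice(4, 13), 1). A toy illustration
# (the column indices mirror the call above, the data is synthetic): columns 4..12 are
# removed (the stop index 13 is exclusive) and the remaining features keep their order.
def _np_delete_demo():
    toy = np.arange(2 * 15).reshape(2, 15)
    trimmed = np.delete(toy, slice(4, 13), 1)
    print(trimmed.shape)                    # (2, 6): columns 0-3 and 13-14 remain

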
def naive_bayes(sampling=False, isNotebook=False):
    print("Running Gaussian Naive Bayes...")
    DATA_FILE = utils.get_data_directory()

    # The sampling argument determines whether we apply oversampling or not
    if sampling:
        process_method = preprocess.oversample(DATA_FILE)
    else:
        process_method = preprocess.preprocess_data(DATA_FILE)

    X, y = process_method
    X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.8)

    clf = GaussianNB()
    model = clf.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test,
                                        model.predict(X_test),
                                        output_dict=True,
                                        target_names=["No", "Yes"])
    '''
    Since GNB does not have a native way of getting feature importances, we use permutation
    importance. Permutation importance works by shuffling one feature at a time: if shuffling
    a symptom makes the model perform worse, that symptom is likely important, so it receives
    a positive value.
    '''
    imps = permutation_importance(model, X_test, y_test)
    features = utils.get_feature_names()
    feat_imp = list(
        sorted(enumerate(imps.importances_mean),
               key=lambda x: x[1],
               reverse=True))

    if isNotebook:
        return feat_imp
    else:
        utils.display_metrics(report_dict)

    utils.log_results(feat_imp)
    utils.generate_report("GNB", "Naive Bayes", model, X_test, y_test,
                          report_dict)
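

# The docstring in naive_bayes describes the idea behind permutation importance. The sketch
# below is a minimal hand-rolled version on synthetic data (one shuffle per feature, whereas
# sklearn's permutation_importance shuffles several times and averages); it is illustrative
# only, not part of the original pipeline.
def _manual_permutation_importance_demo():
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(200, 3))
    y_demo = (X_demo[:, 0] > 0).astype(int)     # only feature 0 is informative

    gnb = GaussianNB().fit(X_demo, y_demo)
    baseline = gnb.score(X_demo, y_demo)

    for j in range(X_demo.shape[1]):
        X_shuffled = X_demo.copy()
        X_shuffled[:, j] = rng.permutation(X_shuffled[:, j])   # break the feature/label link
        drop = baseline - gnb.score(X_shuffled, y_demo)
        print(j, round(drop, 3))                # feature 0 shows the largest positive drop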