def test_data_split_identical():
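    # Evaluating the same model twice on the same fixed split should be deterministic.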
    X_, y_ = X[:40], np.array([0] * 20 + [1] * 20)
    for md in [
            model.SVM(),
            model.KNN(),
            model.XGBoost(),
            model.LinearModel(),
            model.DecisionTree()
    ]:
        a = evaluation.estimate(md, X_train, X_test, y_train, y_test)
        b = evaluation.estimate(md, X_train, X_test, y_train, y_test)
        assert a == b
        a = evaluation.cross_validation(md,
                                        X_,
                                        y_,
                                        scoring='both',
                                        n_splits=2,
                                        n_jobs=1)
        b = evaluation.cross_validation(md,
                                        X_,
                                        y_,
                                        scoring='both',
                                        n_splits=2,
                                        n_jobs=1)
        assert np.all(a['f1'] == b['f1'])
        assert np.all(a['roc_auc'] == b['roc_auc'])
def test_estimate():
    for md in [
            model.SVM(),
            model.KNN(),
            model.XGBoost(),
            model.LinearModel(),
            model.DecisionTree()
    ]:
        evaluation.estimate(md, X_train, X_test, y_train, y_test)
# Example 3
def test_clone():
    models = [
        model.LinearModel(),
        model.SVM(),
        model.DecisionTree(),
        model.MultiClassesLearner('KNN', {'n_neighbors': 1}),
        model.KNN(),
        model.XGBoost(),
    ]
    from sklearn.base import clone
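    # clone() rebuilds an estimator from its get_params() output, so this fails
    # fast if a model's __init__ signature and get_params() disagree.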
    for md in models:
        clone(md)
# Example 4
def test_best_param_search():
    md = model.SVM()
    best_params, result = evaluation.best_param_search(
        md,
        X=X,
        y=y,
        params=[
            {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
            {'gamma': [0.005, 0.0125, 0.02, 0.04, 0.08, 0.1]}
        ],
        n_jobs=1
    )
    # test scores should vary across at least some parameter combinations
    assert result['test_score'].diff().abs().sum() > 0.0001
    print(best_params)
    print(result)
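

# A minimal sketch of what a helper like evaluation.best_param_search could do,
# assuming scikit-learn-compatible estimators. GridSearchCV, best_params_ and
# cv_results_ are real scikit-learn API; the wiring below is illustrative, not
# the project's actual implementation.
def _sketch_param_search(estimator, X, y, param_grid):
    import pandas as pd
    from sklearn.model_selection import GridSearchCV
    # param_grid may be a dict or a list of dicts, as in test_best_param_search above
    search = GridSearchCV(estimator, param_grid, cv=3, n_jobs=1)
    search.fit(X, y)
    # cv_results_ holds one row per parameter combination, incl. 'mean_test_score'
    return search.best_params_, pd.DataFrame(search.cv_results_)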
def test_cross_validation():
    for md in [
            model.SVM(),
            model.MultiClassesLearner('KNN', {'n_neighbors': 1}),
            model.KNN(),
            model.XGBoost(),
            model.LinearModel(),
            model.DecisionTree(),
    ]:
        try:
            evaluation.cross_validation(md,
                                        X,
                                        y,
                                        scoring='both',
                                        n_jobs=1,
                                        n_splits=2)
        except Exception as e:
            print(md.__class__)
            raise e
# Example 6
def test_n_columns(df, low, high):
    for n_columns in range(low, high):
        data.convert_data_to_dummies('mushrooms.csv', n_columns=n_columns)
        features = pd.read_csv('features.csv', index_col=0)
        labels = pd.read_csv('labels.csv', index_col=0)
        train, test, train_output, test_output = data.train_test_val_split(
            features, labels)

        svm = model.SVM()
        svm.fit(train.values, train_output.values[:, 0])
        result = svm.predict(test.values)

        acc = np.sum(result == test_output.values[:, 0]) / len(result)

        print(n_columns, acc, df.columns[n_columns])
        print()
# Example 7
def test_get_params():
    models = [
        model.LinearModel(),
        model.SVM(),
        model.DecisionTree(),
        model.MultiClassesLearner('KNN', {'n_neighbors': 1}),
        model.KNN(),
        model.XGBoost(),
    ]

    for md in models:
        params = md.get_params()
        try:
            assert 'normalizer_name' in params
            assert 'sample_method' in params
            if not isinstance(md, model.KNN):
                assert 'balanced_learning' in params
        except AssertionError as e:
            print(params)
            print(md.__class__)
            raise e
def main():
    utils.local_css("css/styles.css")
    st.title("Heart Disease Prediction - Manual Parameter Tuner")
    st.sidebar.title("Manual Parameter Tuning")
    st.markdown("### Machine Learning is not only about the algorithms you use but also about the different Parameters "
                "assigned to each of them. The final model is heavily affected by the parameters used for a specific "
                "algorithm. "
                "\nThis interactive web app will help you to explore the various parameters of different ML algorithms."
                "\nThe different ML models presented here are:"
                "\n* Logistic Regression"
                "\n* Support Vector Classifier"
                "\n* k-Nears Neighbour Classifier"
                "\n* Decision Tree Classifier"
                "\n* Random Forest Classifier"
                "\n* Gradient Boosting Classifier"
                "\n* XGBoost Classifier"
                "\n### The dataset used here is the **Framingham** Coronary Heart Disease dataset publicly available "
                "at [Kaggle](https://www.kaggle.com/amanajmera1/framingham-heart-study-dataset)."
                "\n## About the Dataset:"
                "\nThe **Framingham** dataset is from an ongoing cardiovascular study"
                " on residents of the town of Framingham, Massachusetts. The classification goal is "
                "to predict whether the patient has 10-year risk of future coronary heart disease (CHD). The dataset "
                "provides the patients’ information. It includes over 4,240 records and 15 attributes."
                "\n ### Even after optimizing parameters, the model would only work properly if accurate data is provided to it."
                " So, through this web app, the users will be able to get a feel of hyperparameter tuning but only on this specific dataset."
                "\n ## Head to the *Manual Parameter Tuning* section to get started!")

    st.sidebar.markdown("Manually select the model you want to view and use the interactive text boxes, sliding bars "
                        "and buttons to tune the respective models. More than one options are provided for each model"
                        " and you can view and gain insight on how hyper-parameter tuning works. Enjoy exploring!")

    data = pd.read_csv("Dataset/framingham.csv")
    data = utils.preprocess(data)

    st.sidebar.markdown("\n#### Exploratory Data Analysis:")
    viz_list = st.sidebar.multiselect("(Be sure to clear all selected options here before moving on, for a faster response)",
                                      ('Categorical Visualisation',
                                       'Numerical Visualisation',
                                       'sysBP and diaBP Visualisation'))
    utils.visualize(viz_list, data)
    if st.sidebar.checkbox("View raw and preprocessed data", False):
        st.subheader("Raw-preprocessed Data")
        st.write(data)

    st.sidebar.markdown("\n#### Feature Selection:")
    feature = st.sidebar.radio("Feature selection using chi-squared test", ("Don't select features",
                                                                            "Select Features"), key='feature')
    if feature == "Don't select features":
        st.markdown("### Feature Selection is not done!")
    else:
        st.markdown("Best 10 features along with their chi-squared score")
        score, data = utils.feature_selection(data)
        st.write(score)
        if st.sidebar.checkbox("Plot Feature Selection", False):
            utils.plot_feature_selection(score)
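        # utils.feature_selection is this project's helper; a minimal sketch of
        # chi-squared scoring with scikit-learn (an assumption about what it wraps;
        # chi2 requires non-negative feature values):
        #     from sklearn.feature_selection import SelectKBest, chi2
        #     selector = SelectKBest(chi2, k=10).fit(X_features, y_labels)
        #     score = pd.Series(selector.scores_, index=X_features.columns)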


    train_x, test_x, train_y, test_y = utils.split_and_scale(data)

    class_names = ["Has Heart Disease", "Doesn't have Heart Disease"]
    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox("Classifier", ("Logistic Regression",
                                                     "Support Vector Classifier",
                                                     "k-Nears Neighbour Classifier",
                                                     "Decision Tree Classifier",
                                                     "Random Forest Classifier",
                                                     "Gradient Boosting Classifier",
                                                     "XGBoost Classifier"))

    if classifier == "Logistic Regression":
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='Lr')
        max_iter = st.sidebar.slider("Maximum no. of Iterations", 100, 500, key='max_iter')

        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("Logistic Regression Results")
            y_pred, accuracy, models = model.LR(train_x, test_x, train_y, test_y, C=C, max_iter=max_iter)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)
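            # model.LR is this project's wrapper; a plausible minimal sketch with
            # scikit-learn (an assumption, not the project's actual code):
            #     from sklearn.linear_model import LogisticRegression
            #     clf = LogisticRegression(C=C, max_iter=max_iter).fit(train_x, train_y)
            #     y_pred, accuracy = clf.predict(test_x), clf.score(test_x, test_y)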

    if classifier == "Support Vector Classifier":
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='C')
        gamma = st.sidebar.radio("Gamma (for non-linear hyperplanes)", ("auto", "scale"), key='gamma')
        kernel = st.sidebar.radio("Kernel (type of hyperplane)", ("linear", "rbf", "poly"), key='kernel')
        degree = 3
        if kernel == 'poly':
            degree = st.sidebar.number_input("Degree of the polynomial used to find the hyperplane", 1, 10, step=1,
                                             key='degree')
        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("Support Vector Classification Results")
            y_pred, accuracy, models = model.SVM(train_x, test_x, train_y, test_y, C=C, gamma=gamma, kernel=kernel,
                                                 degree=degree)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)

    if classifier == "k-Nears Neighbour Classifier":
        st.sidebar.subheader("Model Hyperparameters")
        n = st.sidebar.number_input("n_neighbors (Number of nearest neighbors)", 1, 20, step=1, key='n')
        leaf_size = st.sidebar.slider("Leaf Size", 10, 200, key='leaf_size')
        algorithm = st.sidebar.radio("Algorithm to use", ("ball_tree", "kd_tree", "auto"), key='algorithm')

        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("kNN Classification Results")
            y_pred, accuracy, models = model.KNN(train_x, test_x, train_y, test_y, n=n, leaf_size=leaf_size,
                                                 algorithm=algorithm)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)

    if classifier == "Decision Tree Classifier":
        criterion = st.sidebar.radio("Criterion of splitting trees", ("gini", "entropy"), key='criterion')
        max_depth = st.sidebar.slider("Max depth of the tree", 1, 50, key='amx_depth')
        min_samples_leaf = st.sidebar.number_input("Minimum Leaf Samples", 1, 10, step=1, key='min_samples_leaf')
        max_features = st.sidebar.radio("No. of features to consider during best split", ("auto", "sqrt", "log2"),
                                        key='max_features')

        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("Decision Tree Classification Results")
            y_pred, accuracy, models = model.DT(train_x, test_x, train_y, test_y, criterion=criterion,
                                                max_depth=max_depth,
                                                leaf=min_samples_leaf, max_features=max_features)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)

    if classifier == "Random Forest Classifier":
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.slider("Number of Trees in the Random Forest", 100, 4000, key='n_estimators')
        max_depth = st.sidebar.number_input("The maximum depth of the tree", 1, 100, step=5, key='max_depth')
        bootstrap = st.sidebar.radio("Bootstrap samples when building trees", (True, False), key='bootstrap')

        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("Random Forest Classification Results")
            y_pred, accuracy, models = model.RF(train_x, test_x, train_y, test_y, n_estimators=n_estimators,
                                                max_depth=max_depth, bootstrap=bootstrap)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)

    if classifier == "Gradient Boosting Classifier":
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.slider("Number of Trees in the Gradient Boost ensemble", 100, 4000,
                                         key='n_estimators')
        max_depth = st.sidebar.number_input("The maximum depth of the tree", 1, 100, step=5, key='max_depth')
        learning_rate = st.sidebar.number_input("Learning Rate", 0.01, 10.0, step=0.01, key='learning_rate')
        warm_start = st.sidebar.radio("Reuse the previous solution to grow the ensemble (warm start)", (True, False), key='warm_start')

        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("Gradient Boosting Classification Results")
            y_pred, accuracy, models = model.GBC(train_x, test_x, train_y, test_y, n_estimators=n_estimators,
                                                 max_depth=max_depth, learning_rate=learning_rate,
                                                 warm_start=warm_start)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)

    if classifier == "XGBoost Classifier":
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.slider("Number of Trees in the XGBoost ensemble", 100, 4000, key='n_estimators')
        max_depth = st.sidebar.number_input("The maximum depth of the tree", 1, 100, step=5, key='max_depth')
        eta = st.sidebar.number_input("Learning Rate", 0.01, 10.0, step=0.01, key='eta')
        colsample_bytree = st.sidebar.number_input("Percentage of features used per tree", 0.01, 1.0, step=0.01,
                                                   key='colsample_bytree')
        reg_alpha = st.sidebar.number_input("L1 regularization on leaf weights", 1, 10, step=1, key='reg_alpha')
        reg_lambda = st.sidebar.number_input("L2 regularization on leaf weights", 1, 10, step=1, key='reg_lambda')

        metrics = st.sidebar.multiselect("What matrix to plot?", ("Confusion Matrix", "ROC Curve",
                                                                  "Precision-Recall Curve"))

        if st.sidebar.button("Classify", key="classify"):
            st.subheader("Extreme Gradient Boosting(XGBoost) Classification Results")
            y_pred, accuracy, models = model.XGB(train_x, test_x, train_y, test_y, n_estimators=n_estimators,
                                                 max_depth=max_depth, eta=eta, colsample_bytree=colsample_bytree,
                                                 reg_alpha=reg_alpha, reg_lambda=reg_lambda)
            st.write("Accuracy: ", accuracy.round(3))
            st.write("Precision: ", precision_score(test_y, y_pred, labels=class_names).round(3))
            st.write("Recall: ", recall_score(test_y, y_pred, labels=class_names).round(3))
            utils.plot_metrics(metrics, models, test_x, test_y, class_names)
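
# Standard script entry point (assumed; with Streamlit, launch via `streamlit run`).
if __name__ == "__main__":
    main()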
# Imports
import numpy as np
import pandas as pd
import csv
import random
import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
import model as m

model = m.SVM()
log_data, results = model.getselectedpartialData(
    "logs/mixed_preprocess_logs.csv", anomalies=0.001, num=10000)
model.processData()
model.shuffleData()
model.fitTransform()
#model.partialData(50000)
scores = model.crossValidationModel(k="linear", C=2, cv=10)
print(scores)
print(scores["test_acc"].mean())
print(scores["test_prec"].mean())
print(scores["test_rec"].mean())
exit()
#model.traintestSplit(testsize=0.2)
#model.model(k="rbf", C_val=20)
#print(model.testModel())
#model.testRows(1)
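
# crossValidationModel above is this project's wrapper; the "test_acc"/"test_prec"/
# "test_rec" keys suggest scikit-learn's cross_validate with a scoring dict, e.g.
# (an assumption, not the project's actual code):
#     from sklearn.model_selection import cross_validate
#     scores = cross_validate(estimator, X, y, cv=10,
#                             scoring={'acc': 'accuracy', 'prec': 'precision', 'rec': 'recall'})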
"""
Exemple #10
0
import model

# Dataset menu (the options mirror the CSV files handled below)
data = None
print("1 : WSKT.JK.csv")
print("2 : BBCA.JK.csv")
print("3 : BNGA.JK.csv")
while data is None:
    user_selection = int(input("Select a dataset : "))
    if user_selection == 1:
        data = "WSKT.JK.csv"
    elif user_selection == 2:
        data = "BBCA.JK.csv"
    elif user_selection == 3:
        data = "BNGA.JK.csv"
    else:
        print("Please select a valid option!")

print("1 : Linear Regression")
print("2 : Random Forest")
print("3 : Support Vector Machine")
algorithm = None
while algorithm is None:
    selected_algorithm = int(input("Select an algorithm : "))
    if selected_algorithm == 1:
        algorithm = selected_algorithm
        model.linearRegressor(data)
    elif selected_algorithm == 2:
        algorithm = selected_algorithm
        model.randForestRegressor(data)
    elif selected_algorithm == 3:
        algorithm = selected_algorithm
        model.SVM(data)
    else:
        print("Please select a valid option!")
# coding:utf-8

import model

# Test file location
testfile = 'data/train/poc_test.txt'

# Predict with the SVM model
a = model.SVM()
# Or predict with the LogisticRegression model
# a = model.LG()

with open(testfile, 'r') as f:
    print('Testfile: ' + testfile)
    predict_list = [line.strip('\n') for line in f.readlines()]
    result = a.predict(predict_list)
    print('First 10 Malicious Requests: ' + str(result[1][:10]))
    print('First 10 Normal Requests: ' + str(result[0][:10]))