Esempio n. 1
0
def run_model():
    """Fit three naive Bayes classifiers and report how each does on the test split.

    For every classifier this prints a classification report and a confusion
    matrix, then passes a second confusion matrix (label order pinned to
    ``[0, 1]``) to ``cm`` and the fitted model to ``ROC``.
    """
    # Train/test split as returned by clean_data(scaled=1).
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # NOTE(review): MultinomialNB rejects negative feature values -- confirm
    # that clean_data(scaled=1) produces non-negative features.
    classifiers = (GaussianNB(), BernoulliNB(), MultinomialNB())

    for model in classifiers:
        model_label = str(model)
        print(f'Working {model_label}\n')

        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)

        # Text summaries on stdout.
        print(classification_report(y_true=y_test, y_pred=predictions))
        print(confusion_matrix(y_true=y_test, y_pred=predictions))

        # Confusion matrix with an explicit label order for the cm helper.
        matrix = confusion_matrix(y_true=y_test,
                                  y_pred=predictions,
                                  labels=[0, 1])
        cm(cm=matrix,
           target_names=['Burpee no jump', 'Burpee'],
           title=model_label + ' CM')

        # ROC helper for the fitted classifier on the test split.
        ROC(Model=model, Y_test=y_test, X_test=X_test_scaled)
Esempio n. 2
0
def run_model():
    """Grid-search a liblinear logistic regression and evaluate it on the test split.

    The search covers both the L1 and L2 penalties over a linear grid of ``C``
    values and is scored on precision. The winning estimator is then used to
    print a classification report and to feed the ``cm``, ``bar_coef`` and
    ``ROC`` helpers.
    """
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    base_estimator = LogisticRegression(solver='liblinear',
                                        max_iter=150,
                                        random_state=42)

    # An earlier log-spaced run located C near 10, so this pass searches a
    # fine linear grid around that value, for both penalty types.
    search_space = {
        'C': np.linspace(8, 12, 30),
        'penalty': ['l1', 'l2'],
    }

    # 5-split stratified shuffle-split cross validation, scored on precision.
    splitter = StratifiedShuffleSplit(n_splits=5,
                                      test_size=0.2,
                                      random_state=42)
    grid = GridSearchCV(base_estimator,
                        search_space,
                        cv=splitter,
                        verbose=0,
                        scoring='precision',
                        n_jobs=-1)

    best_model = grid.fit(X_train_scaled, y_train)
    winner = best_model.best_estimator_
    winner_params = winner.get_params()

    # Report the selected hyperparameters and the cross-validated score.
    print('Best Penalty:', winner_params['penalty'])
    print('Best C:', winner_params['C'])
    print(f'Best training score {best_model.best_score_}')

    # Predict on the held-out test split.
    test_predictions = winner.predict(X_test_scaled)
    print(classification_report(y_true=y_test, y_pred=test_predictions))

    matrix = confusion_matrix(y_true=y_test,
                              y_pred=test_predictions,
                              labels=[0, 1])
    cm(cm=matrix,
       target_names=['Burpee no jump', 'Burpee'],
       title='Logistric Regression CM')

    # Bar graph of the coefficients via the bar_coef helper.
    # NOTE(review): the keyword is X_train_scaled but the test features are
    # passed, and the helper gets the search object rather than
    # best_estimator_ -- confirm both are intended.
    bar_coef(Model=best_model, X_train_scaled=X_test_scaled)

    # ROC helper for the selected estimator on the test split.
    ROC(Model=winner, Y_test=y_test, X_test=X_test_scaled)
Esempio n. 3
0
def run_model():
    """Grid-search an SVC over kernels and evaluate the winner on the test split.

    The search is scored on precision using a 10-split stratified shuffle
    split. The best estimator is then used to print a classification report
    and to feed the ``cm`` and ``ROC`` helpers.

    The parameter grid is split per kernel family so that only parameters the
    kernel actually uses are varied: ``degree`` only affects the ``poly``
    kernel and ``gamma`` is ignored by the ``linear`` kernel. Crossing every
    parameter with every kernel would refit many identical models.
    """
    # Load data
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # probability=True is kept from the original estimator configuration.
    svc = SVC(max_iter=-1, probability=True)

    # Polynomial degrees 1..4 (only meaningful for the poly kernel)
    degree = np.arange(start=1, stop=5, step=1)
    # Kernel coefficient for the rbf, poly and sigmoid kernels
    gamma = np.logspace(start=-15, stop=4, num=18, base=2)
    # C penalty factor (used by every kernel)
    C = np.logspace(start=-3, stop=16, num=18, base=2)

    # One sub-grid per group of kernels that share the same relevant
    # parameters. This covers the same set of distinct models as the full
    # cross product, without the redundant duplicates.
    hyperparameters = [
        dict(kernel=['rbf', 'sigmoid'], gamma=gamma, C=C),
        dict(kernel=['linear'], C=C),
        dict(kernel=['poly'], degree=degree, gamma=gamma, C=C),
    ]

    # Grid search using 10-split stratified shuffle-split cross validation
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    clf = GridSearchCV(svc,
                       hyperparameters,
                       cv=cv,
                       verbose=1,
                       scoring='precision',
                       n_jobs=-1)

    # Fit grid search
    best_model = clf.fit(X_train_scaled, y_train)

    # View best hyperparameters (only the keys of the winning sub-grid appear)
    print(f'Best parameters {best_model.best_params_}')
    print(f'Train score is {best_model.best_score_}')

    # Result recorded from an earlier run:
    # Best parameters {'C': 0.125, 'degree': 2, 'gamma': 0.07063223646433309, 'kernel': 'poly'}

    # Predict on the held-out test split
    y_pred = best_model.best_estimator_.predict(X_test_scaled)

    # Classification report
    report = classification_report(y_true=y_test, y_pred=y_pred)
    print(report)

    # Confusion Matrix
    CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    cm(cm=CM, target_names=['Burpee no jump', 'Burpee'], title='SVM CM')

    # ROC
    ROC(Model=best_model.best_estimator_, Y_test=y_test, X_test=X_test_scaled)
Esempio n. 4
0
def run_model():
    """Grid-search the neighbour count of a KNN classifier and evaluate the winner.

    Every ``n_neighbors`` value from 1 to 29 is scored on precision with a
    10-split stratified shuffle split. The mean cross-validation score per K
    is plotted, and the best estimator is used to print a classification
    report and to feed the ``cm`` and ``ROC`` helpers.
    """
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # Candidate neighbour counts: 1, 2, ..., 29.
    neighbor_counts = np.arange(start=1, stop=30, step=1)
    search_space = {'n_neighbors': neighbor_counts}

    # 10-split stratified shuffle-split cross validation, scored on precision.
    splitter = StratifiedShuffleSplit(n_splits=10,
                                      test_size=0.2,
                                      random_state=42)
    grid = GridSearchCV(KNeighborsClassifier(),
                        search_space,
                        cv=splitter,
                        verbose=0,
                        scoring='precision',
                        n_jobs=-1)

    best_model = grid.fit(X_train_scaled, y_train)
    winner = best_model.best_estimator_

    # Report the selected K and its cross-validated score.
    print('Best N:', winner.get_params()['n_neighbors'])
    print(f'Train score is {best_model.best_score_}')

    # Mean cross-validation precision for each candidate K.
    mean_scores = best_model.cv_results_['mean_test_score']
    plt.plot(neighbor_counts, mean_scores)
    plt.xlabel('K')
    plt.ylabel('Mean Test Score - Precision')

    # Predict on the held-out test split.
    test_predictions = winner.predict(X_test_scaled)
    print(classification_report(y_true=y_test, y_pred=test_predictions))

    matrix = confusion_matrix(y_true=y_test,
                              y_pred=test_predictions,
                              labels=[0, 1])
    cm(cm=matrix, target_names=['Burpee no jump', 'Burpee'], title='KNN CM')

    # ROC helper for the selected estimator on the test split.
    ROC(Model=winner, Y_test=y_test, X_test=X_test_scaled)
Esempio n. 5
0
def run_model():
    """Random-search a random forest's hyperparameters and evaluate the winner.

    100 parameter settings are sampled from the grid and scored on precision
    using a 5-split stratified shuffle split. The best estimator is then used
    to print a classification report and to feed the ``cm`` and ``ROC``
    helpers.
    """
    # Load data
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # Create the random forest
    rf = RandomForestClassifier(random_state=42)

    # Maximum number of levels in a tree: 10, 20, ..., 110, plus unlimited
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)

    # Hyperparameter space sampled by the randomized search
    random_grid = {
        # Number of trees in the forest: 1..29
        'n_estimators': np.arange(start=1, stop=30, step=1),
        # Number of features to consider at every split. 'auto' was an alias
        # of 'sqrt' for classifiers and is rejected by scikit-learn >= 1.3,
        # so it is replaced with 'log2' to keep two distinct, valid options.
        'max_features': ['sqrt', 'log2'],
        'max_depth': max_depth,
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4],
        # Whether each tree is trained on a bootstrap sample
        'bootstrap': [True, False],
    }

    # 5-split stratified shuffle-split cross validation. The splitter is now
    # actually passed to the search; before, it was built but cv=3 was used.
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    clf = RandomizedSearchCV(estimator=rf,
                             param_distributions=random_grid,
                             n_iter=100,
                             cv=cv,
                             verbose=2,
                             random_state=42,
                             n_jobs=-1,
                             scoring='precision')

    # Fit randomized search
    best_model = clf.fit(X_train_scaled, y_train)

    # View best hyperparameters
    print(f'Best parameters {best_model.best_params_}')
    print(f'Train score is {best_model.best_score_}')
    # Result recorded from an earlier run (made with cv=3 and 'auto' in the grid):
    # Best parameters {'n_estimators': 17, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
    # Train score is 0.9477959255356516

    # Predict on the held-out test split
    y_pred = best_model.best_estimator_.predict(X_test_scaled)
    report = classification_report(y_true=y_test, y_pred=y_pred)
    print(report)

    # Confusion Matrix
    CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    cm(cm=CM,
       target_names=['Burpee no jump', 'Burpee'],
       title='Random Forest CM')

    # ROC
    ROC(Model=best_model.best_estimator_, Y_test=y_test, X_test=X_test_scaled)