def run_model():
    """Fit three naive Bayes classifiers and report their test-set results.

    Loads the scaled train/test split from ``clean_data(scaled=1)``, then
    fits ``GaussianNB``, ``BernoulliNB`` and ``MultinomialNB`` in turn.  For
    each model it prints the classification report and the raw confusion
    matrix, and forwards a confusion matrix with fixed label order ``[0, 1]``
    to the ``cm`` helper and the fitted model to the ``ROC`` helper.
    """
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    for model_cls in (GaussianNB, BernoulliNB, MultinomialNB):
        nb = model_cls()
        label = str(nb)
        print(f'Working {label}\n')

        nb.fit(X_train_scaled, y_train)
        y_pred = nb.predict(X_test_scaled)

        # Text summary: per-class metrics followed by the raw matrix.
        print(classification_report(y_true=y_test, y_pred=y_pred))
        print(confusion_matrix(y_true=y_test, y_pred=y_pred))

        # Matrix with an explicit label order so it lines up with the
        # target names handed to the cm helper.
        CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
        cm(
            cm=CM,
            target_names=['Burpee no jump', 'Burpee'],
            title=label + ' CM',
        )

        # ROC helper receives the fitted model and the held-out data.
        ROC(Model=nb, Y_test=y_test, X_test=X_test_scaled)
def run_model():
    """Tune a logistic regression with grid search and report test results.

    Searches over the regularisation penalty (``l1``/``l2``) and 30 values
    of ``C`` between 8 and 12, scoring by precision over a 5-split
    stratified shuffle split.  Prints the winning hyperparameters and
    cross-validation score, then evaluates the best estimator on the test
    split (classification report, confusion matrix, coefficient bar chart
    helper and ROC helper).
    """
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    base_estimator = LogisticRegression(
        solver='liblinear', max_iter=150, random_state=42
    )

    # An earlier log-space sweep put C near 10, so search a narrow linear
    # band around that value.
    param_grid = {
        'C': np.linspace(8, 12, 30),
        'penalty': ['l1', 'l2'],
    }

    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    search = GridSearchCV(
        base_estimator,
        param_grid,
        cv=splitter,
        verbose=0,
        scoring='precision',
        n_jobs=-1,
    )
    best_model = search.fit(X_train_scaled, y_train)

    winner = best_model.best_estimator_
    print('Best Penalty:', winner.get_params()['penalty'])
    print('Best C:', winner.get_params()['C'])
    print(f'Best training score {best_model.best_score_}')

    # Predict on the held-out test split with the refitted best estimator.
    y_pred = winner.predict(X_test_scaled)
    print(classification_report(y_true=y_test, y_pred=y_pred))

    CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    cm(
        cm=CM,
        target_names=['Burpee no jump', 'Burpee'],
        title='Logistric Regression CM',
    )

    # NOTE(review): bar_coef is given the fitted search object (not
    # best_estimator_) and the *test* matrix under the X_train_scaled
    # keyword - confirm this is what the helper expects.
    bar_coef(Model=best_model, X_train_scaled=X_test_scaled)

    ROC(Model=winner, Y_test=y_test, X_test=X_test_scaled)
def run_model():
    """Tune an SVC with grid search and report its test-set results.

    The search is scored by precision over a 10-split stratified shuffle
    split.  The hyperparameter space is given as one grid per kernel family
    so that parameters a kernel ignores are not crossed with it: ``degree``
    is only used by the ``poly`` kernel and ``gamma`` is not used by the
    ``linear`` kernel.  This explores the same set of distinct models as a
    single flat grid, without refitting duplicates.

    After fitting, prints the best parameters and cross-validation score,
    then evaluates the best estimator on the test split (classification
    report, confusion matrix helper and ROC helper).
    """
    # Load data
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # probability=True is kept from the original configuration; presumably
    # the ROC helper needs probability estimates - confirm against ROC.
    svc = SVC(max_iter=-1, probability=True)

    # Polynomial degree (poly kernel only)
    degree = np.arange(start=1, stop=5, step=1)
    # Kernel coefficient (rbf, poly and sigmoid kernels)
    gamma = np.logspace(start=-15, stop=4, num=18, base=2)
    # C penalty factor (all kernels)
    C = np.logspace(start=-3, stop=16, num=18, base=2)

    # One grid per kernel family, each listing only the parameters that
    # kernel actually uses.
    hyperparameters = [
        {'kernel': ['rbf', 'sigmoid'], 'gamma': gamma, 'C': C},
        {'kernel': ['linear'], 'C': C},
        {'kernel': ['poly'], 'degree': degree, 'gamma': gamma, 'C': C},
    ]

    # Grid search using 10-split stratified shuffle split cross validation
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    clf = GridSearchCV(
        svc,
        hyperparameters,
        cv=cv,
        verbose=1,
        scoring='precision',
        n_jobs=-1,
    )

    # Fit grid search
    best_model = clf.fit(X_train_scaled, y_train)

    # View best hyperparameters
    print(f'Best parameters {best_model.best_params_}')
    print(f'Train score is {best_model.best_score_}')
    # Recorded from a previous run:
    # Best parameters {'C': 0.125, 'degree': 2, 'gamma': 0.07063223646433309, 'kernel': 'poly'}

    # Predict on the held-out test split with the refitted best estimator
    y_pred = best_model.best_estimator_.predict(X_test_scaled)

    # Classification report
    report = classification_report(y_true=y_test, y_pred=y_pred)
    print(report)

    # Confusion matrix with explicit label order matching target_names
    CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    cm(cm=CM, target_names=['Burpee no jump', 'Burpee'], title='SVM CM')

    # ROC
    ROC(Model=best_model.best_estimator_, Y_test=y_test, X_test=X_test_scaled)
def run_model():
    """Tune k for a k-nearest-neighbours classifier and report test results.

    Grid-searches ``n_neighbors`` over 1..29, scoring by precision over a
    10-split stratified shuffle split.  Prints the chosen k and its
    cross-validation score, plots mean precision against k, then evaluates
    the best estimator on the test split (classification report, confusion
    matrix helper and ROC helper).
    """
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # Candidate neighbour counts: 1 through 29 inclusive.
    n_neighbors = np.arange(start=1, stop=30, step=1)
    param_grid = {'n_neighbors': n_neighbors}

    splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
    search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid,
        cv=splitter,
        verbose=0,
        scoring='precision',
        n_jobs=-1,
    )
    best_model = search.fit(X_train_scaled, y_train)
    winner = best_model.best_estimator_

    print('Best N:', winner.get_params()['n_neighbors'])
    print(f'Train score is {best_model.best_score_}')

    # Mean cross-validated precision for each candidate k.
    mean_scores = best_model.cv_results_['mean_test_score']
    plt.plot(n_neighbors, mean_scores)
    plt.xlabel('K')
    plt.ylabel('Mean Test Score - Precision')

    # Predict on the held-out test split with the refitted best estimator.
    y_pred = winner.predict(X_test_scaled)
    print(classification_report(y_true=y_test, y_pred=y_pred))

    CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    cm(cm=CM, target_names=['Burpee no jump', 'Burpee'], title='KNN CM')

    ROC(Model=winner, Y_test=y_test, X_test=X_test_scaled)
def run_model():
    """Tune a random forest with randomized search and report test results.

    Samples 100 hyperparameter combinations from the grid below, scoring by
    precision over a 5-split stratified shuffle split.  Prints the best
    parameters and cross-validation score, then evaluates the best estimator
    on the test split (classification report, confusion matrix helper and
    ROC helper).
    """
    # Load data
    X_train_scaled, y_train, X_test_scaled, y_test = clean_data(scaled=1)

    # Create random forest
    rf = RandomForestClassifier(random_state=42)

    # Hyperparameter space for RF
    # Number of trees in random forest
    n_estimators = np.arange(start=1, stop=30, step=1)
    # Number of features to consider at every split.  'auto' is not listed:
    # for classifiers it was an alias of 'sqrt' and recent scikit-learn
    # releases reject it.
    max_features = ['sqrt', 'log2']
    # Maximum number of levels in tree (None = unlimited)
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }

    # Randomized search using 5-split stratified shuffle split cross
    # validation; the splitter is passed to the search so it is actually
    # the scheme used.
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    clf = RandomizedSearchCV(
        estimator=rf,
        param_distributions=random_grid,
        n_iter=100,
        cv=cv,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='precision',
    )

    # Fit randomized search
    best_model = clf.fit(X_train_scaled, y_train)

    # View best hyperparameters
    print(f'Best parameters {best_model.best_params_}')
    print(f'Train score is {best_model.best_score_}')
    # Recorded from a previous run (made with a different CV setting, so
    # expect the numbers to differ):
    # Best parameters {'n_estimators': 17, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
    # Train score is 0.9477959255356516

    # Predict on the held-out test split with the refitted best estimator
    y_pred = best_model.best_estimator_.predict(X_test_scaled)
    report = classification_report(y_true=y_test, y_pred=y_pred)
    print(report)

    # Confusion matrix with explicit label order matching target_names
    CM = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    cm(cm=CM, target_names=['Burpee no jump', 'Burpee'], title='Random Forest CM')

    # ROC
    ROC(Model=best_model.best_estimator_, Y_test=y_test, X_test=X_test_scaled)