import itertools
import os

import numpy as np
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              VotingClassifier)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold, train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# project-local helpers; the exact module paths are assumed from the package
# layout referenced in the `learn` docstring below
from machine_learning.aux import model_stats
from machine_learning.aux.persistence import save_model


def learn_decision_tree(stratified_data_csv_file, save_filepath):
    # read the stratified dataset (the last column holds the labels)
    data = np.genfromtxt(stratified_data_csv_file, delimiter=',', skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=10)

    # hyperparameters to grid-search over
    params = {
        'max_depth': [None, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    classifier = GridSearchCV(DecisionTreeClassifier(), params,
                              cv=stratified_k_fold, verbose=5)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('Decision Trees Model Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'Decision Trees Classifier Normalized Confusion Matrix')

    # refit the classifier on the complete dataset with the best parameters
    best_classifier = DecisionTreeClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)

    # save the model
    save_model(best_classifier, save_filepath)
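
# A minimal sketch of loading a saved model back for prediction. save_model's
# implementation is not shown in this module, so the joblib call below is an
# assumption (it holds if the helper pickles the estimator via joblib):
#
#     import joblib
#     clf = joblib.load('models/decision_tree.pkl')  # hypothetical path
#     y_new = clf.predict(X_new)
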
def learn_sgd(stratified_data_csv_file, save_filepath):
    # read the stratified dataset (the last column holds the labels)
    data = np.genfromtxt(stratified_data_csv_file, delimiter=',', skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=10)

    # standardize features to zero mean and unit variance; fit the scaler on
    # the training split only and reuse it on the test split to avoid leakage
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # hyperparameters to grid-search over
    # (newer scikit-learn versions spell the log loss 'log_loss')
    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [1e-5, 1e-4, 1e-3],
        'max_iter': [200],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    classifier = GridSearchCV(SGDClassifier(), params,
                              cv=stratified_k_fold, verbose=5)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('SGD Model Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'SGD Classifier Normalized Confusion Matrix')

    # refit the classifier on the complete dataset with the best parameters,
    # scaling it the same way the CV data was scaled
    X = StandardScaler().fit_transform(X)
    best_classifier = SGDClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)

    # save the model
    save_model(best_classifier, save_filepath)
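
# A leak-free alternative sketch: wrap the scaler and the estimator in a
# Pipeline so the scaler is refit on each CV training fold (grid keys gain a
# 'sgd__' prefix). This illustrates the idiom; it is not the code path used
# above:
#
#     from sklearn.pipeline import Pipeline
#     pipe = Pipeline([('scale', StandardScaler()), ('sgd', SGDClassifier())])
#     grid = GridSearchCV(pipe, {'sgd__alpha': [1e-5, 1e-4, 1e-3]},
#                         cv=StratifiedKFold(n_splits=10))
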
def learn_bagging(stratified_data_csv_file, save_filepath):
    # read the stratified dataset (the last column holds the labels)
    data = np.genfromtxt(stratified_data_csv_file, delimiter=',', skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=10)

    # hyperparameters to grid-search over (base_estimator=None falls back to a
    # decision tree; newer scikit-learn versions rename this to 'estimator')
    params = {
        'base_estimator': [None, KNeighborsClassifier()],
        'n_estimators': [10, 20, 30],
        'max_samples': [0.50, 0.75],
        'max_features': [0.50, 0.75],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    classifier = GridSearchCV(BaggingClassifier(), params,
                              cv=stratified_k_fold, verbose=5, n_jobs=3)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('Bagging Classifier Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'Bagging Classifier Normalized Confusion Matrix')

    # refit the classifier on the complete dataset with the best parameters
    best_classifier = BaggingClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)

    # save the model
    save_model(best_classifier, save_filepath)
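
# Because bagging draws bootstrap samples, BaggingClassifier can also report
# an out-of-bag generalization estimate without a separate holdout, e.g.:
#
#     clf = BaggingClassifier(n_estimators=30, oob_score=True).fit(X, y)
#     print(clf.oob_score_)
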
def learn_rbf_svm(stratified_data_csv_file, save_filepath):
    # read the stratified dataset (the last column holds the labels)
    data = np.genfromtxt(stratified_data_csv_file, delimiter=',', skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=10)

    # standardize features to zero mean and unit variance; fit the scaler on
    # the training split only and reuse it on the test split to avoid leakage
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # hyperparameters to grid-search over
    params = {
        'kernel': ['rbf'],
        'C': np.logspace(-2, 2, 5),
        'gamma': np.logspace(-2, 2, 5),
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    classifier = GridSearchCV(SVC(), params, cv=stratified_k_fold, verbose=5)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('RBF Kernel SVM Model Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'RBF Kernel SVM Classifier Normalized Confusion Matrix')

    # refit the classifier on the complete dataset with the best parameters,
    # scaling it the same way the CV data was scaled
    X = StandardScaler().fit_transform(X)
    best_classifier = SVC(**classifier.best_params_)
    best_classifier.fit(X, y)

    # save the model
    save_model(best_classifier, save_filepath)
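
# np.logspace(-2, 2, 5) expands to [0.01, 0.1, 1.0, 10.0, 100.0], so C and
# gamma are each searched over five values spanning four orders of magnitude
# (25 parameter pairs, each cross-validated over 10 stratified folds).
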
def learn_voting_classifier(stratified_data_csv_file, save_filepath):
    # read the stratified dataset (the last column holds the labels)
    data = np.genfromtxt(stratified_data_csv_file, delimiter=',', skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=10)

    # classifiers to test (pre-tuned; disabled entries are kept for reference)
    classifiers = [
        # ('dt', DecisionTreeClassifier(max_depth=None, min_samples_split=2)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        # ('lin_svm', SVC(C=100.0, kernel='linear')),
        # ('logreg', LogisticRegression(C=100.0, max_iter=200, penalty='l1')),
        ('nb', GaussianNB()),
        # ('rbf_svm', SVC(C=100.0, gamma=0.1, kernel='rbf')),
        ('rf', RandomForestClassifier(max_depth=20, min_samples_split=5,
                                      n_estimators=30)),
        ('sgd', SGDClassifier(alpha=0.0001, loss='log', max_iter=200,
                              penalty='l2')),
        ('bagging', BaggingClassifier(base_estimator=None, max_features=0.75,
                                      max_samples=0.5, n_estimators=30)),
        # ('boosting', GradientBoostingClassifier(
        #     learning_rate=0.1, max_depth=5, min_samples_split=5,
        #     n_estimators=300)),
    ]

    # create all possible non-empty combinations of the enabled classifiers
    combinations_ = []
    for i in range(len(classifiers)):
        combinations_.extend(itertools.combinations(classifiers, i + 1))

    # search space; the estimator lists are supplied through the sampler, so
    # the VotingClassifier below starts with estimators=None as a placeholder
    params = {
        'estimators': combinations_,
        'voting': ['soft', 'hard'],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    classifier = RandomizedSearchCV(VotingClassifier(estimators=None), params,
                                    cv=stratified_k_fold, verbose=5, n_jobs=3)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('Voting Classifier Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'Voting Classifier Normalized Confusion Matrix')

    # refit the classifier on the complete dataset with the best parameters
    best_classifier = VotingClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)

    # save the model
    save_model(best_classifier, save_filepath)
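
# With the 5 enabled base estimators there are C(5,1) + C(5,2) + ... + C(5,5)
# = 31 estimator combinations, and the 2 voting modes give 62 candidate
# settings; RandomizedSearchCV samples only n_iter of them (10 by default),
# which is why it is used here instead of an exhaustive GridSearchCV.
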
def learn(training_data_infile, trained_model_outfile=None,
          display_metrics: bool = False, gs_verbose: int = 0, n_jobs=1):
    """
    Trains a voting classifier.

    :param training_data_infile: CSV file containing labeled training data
        • the last column must hold the training labels
        • the CSV file may contain a header (line 1 is skipped)
        • use: machine_learning.aux.data_processing.create_training_dataset
    :param trained_model_outfile: where to save the trained model
    :param display_metrics: whether to print model metrics
    :param gs_verbose: verbosity of the randomized search
    :param n_jobs: number of parallel search jobs
    :return: the voting classifier refit on the complete dataset
    """
    training_data_infile = os.path.abspath(training_data_infile)

    # start
    print('-' * 25)
    print('Starting learning for `Voting Classifier`')
    print('training_infile: {:s}'.format(
        str(os.path.relpath(training_data_infile))))
    print('trained_outfile: {:s}'.format(
        str(os.path.relpath(trained_model_outfile))
        if trained_model_outfile is not None else 'None'))
    print('display_metrics: {:s}, gs_verbose: {:d}, n_jobs: {:d}'.format(
        str(display_metrics), gs_verbose, n_jobs))
    print()

    # read the stratified dataset (the last column holds the labels)
    data = np.genfromtxt(training_data_infile, delimiter=',', skip_header=1)
    features_x, target_y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split
    x_train, x_test, y_train, y_test = train_test_split(features_x, target_y,
                                                        test_size=0.30)

    # standardize features to zero mean and unit variance; fit the scaler on
    # the training split only and reuse it on the test split to avoid leakage
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # classifiers to test (pre-tuned; disabled entries are kept for reference)
    classifiers = [
        # ('dt', DecisionTreeClassifier(max_depth=None, min_samples_split=2)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        # ('lin_svm', SVC(C=100.0, kernel='linear')),
        # ('logreg', LogisticRegression(C=100.0, max_iter=200, penalty='l1')),
        ('nb', GaussianNB()),
        # ('rbf_svm', SVC(C=100.0, gamma=0.1, kernel='rbf')),
        ('rf', RandomForestClassifier(max_depth=20, min_samples_split=5,
                                      n_estimators=30)),
        ('sgd', SGDClassifier(alpha=0.0001, loss='log', max_iter=200,
                              penalty='l2')),
        ('bagging', BaggingClassifier(base_estimator=None, max_features=0.75,
                                      max_samples=0.5, n_estimators=30)),
        # ('boosting', GradientBoostingClassifier(
        #     learning_rate=0.1, max_depth=5, min_samples_split=5,
        #     n_estimators=300)),
    ]

    # create all possible non-empty combinations of the enabled classifiers
    combinations_ = []
    for i in range(len(classifiers)):
        combinations_.extend(itertools.combinations(classifiers, i + 1))

    # search space; the estimator lists are supplied through the sampler, so
    # the VotingClassifier below starts with estimators=None as a placeholder
    params = {
        'estimators': combinations_,
        'voting': ['soft', 'hard'],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    classifier = RandomizedSearchCV(
        VotingClassifier(estimators=None), params, cv=stratified_k_fold,
        scoring={
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score),
            'recall': make_scorer(recall_score),
            'roc_auc': make_scorer(roc_auc_score),
            'f1': make_scorer(f1_score),
        },
        refit='f1', verbose=gs_verbose, n_jobs=n_jobs)
    classifier.fit(x_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(x_test)

    # model statistics
    if display_metrics:
        print('Voting Classifier Statistics')
        print('Best params: {}'.format(classifier.best_params_))
        model_stats.compute_basic_stats(y_test, y_pred)
        model_stats.compute_roc_score(y_test, y_pred)
        model_stats.plot_normalized_confusion_matrix(
            y_test, y_pred, 'Voting Classifier Normalized Confusion Matrix')

    # refit the classifier on the complete dataset with the best parameters,
    # scaling it the same way the CV data was scaled
    features_x = StandardScaler().fit_transform(features_x)
    complete_classifier = VotingClassifier(**classifier.best_params_)
    complete_classifier.fit(features_x, target_y)

    # save the model
    if trained_model_outfile:
        try:
            trained_model_outfile = os.path.abspath(trained_model_outfile)
            save_model(complete_classifier, trained_model_outfile)
            print('Classifier successfully saved at: {:s}'.format(
                str(os.path.relpath(trained_model_outfile))))
        except Exception as exc:
            print('Error while saving model! Could not save at: {:s}'.format(
                str(os.path.relpath(trained_model_outfile))))
            print(exc)

    print('-' * 25)
    return complete_classifier
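
if __name__ == '__main__':
    # minimal usage sketch; the CSV and model paths below are hypothetical
    # placeholders, not files shipped with this module
    learn('data/stratified_training_data.csv',
          trained_model_outfile='models/voting_classifier.pkl',
          display_metrics=True, gs_verbose=1, n_jobs=2)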