def train(self, dataset, classifiers):
    """Find the best classifier with tuned hyperparameters and train it.

    Loads the train dataset, builds features, fine-tunes the candidate
    classifiers, re-trains the winner on the whole train set (no folds)
    and persists the fitted model under ``models/``.

    Parameters
    ----------
    dataset : str
        Path to the train dataset (CSV with a 'status' label column).
    classifiers : list
        Candidate classifiers to fine-tune.
    """
    pt = param_tuning.ParamTuning()
    f = Features()

    tot_time = time.time()

    start_time = time.time()
    data_df = pd.read_csv(dataset)
    ytrain = data_df['status']
    Xtrain = data_df.drop('status', axis=1)
    print("Loaded train dataset in {} sec.".format(time.time() - start_time))

    # Reset the timer so feature building is not charged with load time
    # (matches the timing pattern used in evaluate()).
    start_time = time.time()
    fX = f.build(Xtrain)
    print("Build features from train data in {} sec.".format(
        time.time() - start_time))

    start_time = time.time()
    # 1st phase: find and fine tune the best classifier from a list of candidate ones
    best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
    estimator = best_clf['estimator']
    # FIX: classifier name goes first, then its hyperparams — the original
    # argument order produced "Best hyperparams for {...dict...}, <name>".
    print("Best hyperparams for {}, {}, with score {}; {} sec.".format(
        best_clf['clf_name'], best_clf['hyperparams'], best_clf['score'],
        time.time() - start_time))

    # 2nd phase: train the fine tuned best classifier on the whole train dataset
    estimator = pt.trainClassifier(fX, ytrain, estimator)

    os.makedirs(os.path.join(os.getcwd(), 'models'), exist_ok=True)
    dump(
        estimator,
        os.path.join(os.getcwd(), 'models',
                     best_clf['clf_name'] + '_model.joblib'))

    print("The whole process took {} sec.".format(time.time() - tot_time))
def evaluate(self, dataset, classifier):
    """Evaluate a previously trained classifier on new, unseen data.

    Loads the saved model ``models/<classifier>_model.joblib``, scores the
    test dataset, prints metrics/feature importances, and writes per-row
    prediction probabilities to ``output/predictions.csv``.

    Parameters
    ----------
    dataset : str
        Path to the test dataset (CSV with a 'status' label column).
    classifier : str
        Name of the trained classifier whose saved model is loaded.
    """
    pt = param_tuning.ParamTuning()
    f = Features()

    tot_time = time.time()

    start_time = time.time()
    data_df = pd.read_csv(dataset)
    ytest = data_df['status']
    Xtest = data_df.drop('status', axis=1)
    print("Loaded test dataset in {} sec.".format(time.time() - start_time))

    start_time = time.time()
    fX = f.build(Xtest)
    print("Build features from test data in {} sec".format(
        time.time() - start_time))

    start_time = time.time()
    # 3rd phase: test the fine tuned best classifier on the test dataset
    estimator = load(
        os.path.join(os.getcwd(), 'models', classifier + '_model.joblib'))
    res = pt.testClassifier(fX, ytest, estimator, True)
    self._print_stats(classifier, res['metrics'], res['feature_imp'],
                      start_time)

    # Re-index so the probability columns line up positionally with the rows.
    Xtest.reset_index(inplace=True)
    Xtest = pd.concat([
        Xtest,
        pd.DataFrame(res['proba'],
                     columns=['none_origin_pred', 'dian_origin_pred'])
    ], axis=1)

    os.makedirs('output', exist_ok=True)
    Xtest[[
        'pst_geom', 'dian_geom', 'none_origin_pred', 'dian_origin_pred'
    ]].to_csv(os.path.join('output', 'predictions.csv'), index=False)

    print("The whole process took {} sec.".format(time.time() - tot_time))
def hyperparamTuning(self, dataset, classifiers):
    """Run the full tune/train/test pipeline on a train/test split.

    Splits the dataset, fine-tunes the candidate classifiers on the train
    part, re-trains the best one on the whole train set, and evaluates it
    on the held-out test part, printing timing for each phase.

    Parameters
    ----------
    dataset : str
        Path to the dataset to split into train/test.
    classifiers : list
        Candidate classifiers to fine-tune.
    """
    pt = param_tuning.ParamTuning()
    f = Features()

    tot_time = time.time()

    start_time = time.time()
    Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
    print("Loaded train/test datasets in {} sec.".format(
        time.time() - start_time))

    # Reset the timer so feature building is not charged with load time.
    start_time = time.time()
    fX = f.build(Xtrain)
    print("Build features from train data in {} sec.".format(
        time.time() - start_time))

    start_time = time.time()
    # 1st phase: find and fine tune the best classifier from a list of candidate ones
    best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
    estimator = best_clf['estimator']
    print("Best hyperparams, {}, with score {}; {} sec.".format(
        best_clf['hyperparams'], best_clf['score'],
        time.time() - start_time))

    start_time = time.time()
    # 2nd phase: train the fine tuned best classifier on the whole train dataset (no folds)
    estimator = pt.trainClassifier(fX, ytrain, estimator)
    print("Finished training model on dataset; {} sec.".format(
        time.time() - start_time))

    start_time = time.time()
    fX = f.build(Xtest)
    print("Build features from test data in {} sec".format(
        time.time() - start_time))

    start_time = time.time()
    # 3rd phase: test the fine tuned best classifier on the test dataset
    res = pt.testClassifier(fX, ytest, estimator)
    self._print_stats(best_clf['clf_name'], res['metrics'],
                      res['feature_imp'], start_time)

    print("The whole process took {} sec.".format(time.time() - tot_time))
def exec_classifiers(self, dataset):
    """Train and evaluate selected ML algorithms with custom
    hyper-parameters on dataset.

    Each classifier listed in ``config.MLConf.clf_custom_params`` is
    instantiated with the project seed, configured with its custom
    parameters, trained on the train split and scored on the test split.

    Parameters
    ----------
    dataset : str
        Path to the dataset to split into train/test.
    """
    f = Features()
    pt = param_tuning.ParamTuning()

    start_time = time.time()
    Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
    print("Loaded train/test datasets in {} sec.".format(
        time.time() - start_time))

    # Reset the timer so feature building is not charged with load time.
    start_time = time.time()
    fX_train = f.build(Xtrain)
    fX_test = f.build(Xtest)
    print("Build features from train/test data in {} sec".format(
        time.time() - start_time))

    for clf in config.MLConf.clf_custom_params:
        print('Method {}'.format(clf))
        print('=======', end='')
        print(len(clf) * '=')

        tot_time = time.time()
        start_time = time.time()
        # 1st phase: train each classifier on the whole train dataset (no folds)
        estimator = pt.clf_names[clf][0](random_state=config.seed_no)
        estimator.set_params(**config.MLConf.clf_custom_params[clf])
        estimator = pt.trainClassifier(fX_train, ytrain, estimator)
        print("Finished training model on dataset; {} sec.".format(
            time.time() - start_time))

        start_time = time.time()
        # 2nd phase: test each classifier on the test dataset
        res = pt.testClassifier(fX_test, ytest, estimator)
        self._print_stats(clf, res['metrics'], res['feature_imp'],
                          start_time)

        print("The whole process took {} sec.\n".format(
            time.time() - tot_time))