# Module-level imports assumed by this snippet (logger, config, PIPELINES and
# _read_data are project-specific):
import os
import shutil

import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection


def FeatureSelection(pipeline_name, data_dev_mode, tag, train_filepath, test_filepath):
    logger.info('FEATURE SELECTION...')
    if bool(config.params.clean_experiment_directory_before_training) \
            and os.path.isdir(config.params.experiment_dir):
        logger.info('Cleaning experiment directory...')
        shutil.rmtree(config.params.experiment_dir)

    data = _read_data(data_dev_mode, train_filepath, test_filepath)
    train_set = data['train']
    y = train_set[config.TARGET_COL].values.reshape(-1)
    train_set = train_set.drop(columns=config.TARGET_COL)

    pipeline = PIPELINES[pipeline_name](so_config=config.SOLUTION_CONFIG, suffix=tag)

    # Backward elimination: start from the full feature set and prune down to the
    # best-scoring subset of 10 or more columns (5-fold CV, scored by ROC AUC).
    sfs = SequentialFeatureSelector(estimator=pipeline,
                                    k_features=(10, len(train_set.columns)),
                                    forward=False,
                                    verbose=2,
                                    cv=5,
                                    scoring='roc_auc')
    sfs.fit(train_set.to_numpy(), y)

    plot_sequential_feature_selection(sfs.get_metric_dict())
    plt.ylim([0.6, 1])
    plt.title('Sequential Feature Selection')
    plt.grid()
    plt.show()
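
# FeatureSelection above depends on project plumbing (config, PIPELINES,
# _read_data). The following is a minimal self-contained sketch of the same
# backward selection with a (min, max) k_features range; the breast-cancer
# dataset and the scaler/logistic-regression pipeline are illustrative
# placeholders, not the project's actual estimator.
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
estimator = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# forward=False starts from all features and prunes; the (min, max) tuple keeps
# the best-scoring subset of between 10 and all 30 features.
sfs = SequentialFeatureSelector(estimator=estimator,
                                k_features=(10, X.shape[1]),
                                forward=False,
                                cv=5,
                                scoring='roc_auc',
                                verbose=0)
sfs.fit(X, y)
print('Best subset size:', len(sfs.k_feature_idx_), 'score:', sfs.k_score_)

plot_sequential_feature_selection(sfs.get_metric_dict())
plt.title('Sequential Backward Selection')
plt.grid()
plt.show()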
# Module-level imports assumed by this snippet (logger and analyze_model are
# project-specific):
import pathlib
import time
from typing import Dict, Optional, Union

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import sklearn.base     # exposes sk.base
import sklearn.metrics  # exposes sk.metrics
from mlxtend.plotting import plot_sequential_feature_selection
from sklearn.model_selection import cross_validate


def test_model(model: sk.base.BaseEstimator,
               x_train: pd.DataFrame, y_train: pd.DataFrame,
               x_test: pd.DataFrame, y_test: pd.DataFrame,
               title: str, emotion: str, width: int, location: str,
               out: str = 'models', n_jobs: int = 1,
               cv: Optional[int] = None) -> Dict[str, Union[str, float, int]]:
    """Select features for `model`, train it, evaluate it, and save all artifacts."""
    # Select a single emotion column as the target unless all emotions are requested.
    y_train_target = y_train[f"middle.emotions.{emotion}"] if emotion != 'all' else y_train
    y_test_target = y_test[f"middle.emotions.{emotion}"] if emotion != 'all' else y_test

    out_path = pathlib.Path(out) / f"w{width}/{location}" / f"{title.lower().replace(' ', '-')}"
    out_path.mkdir(parents=True, exist_ok=True)

    report = {}
    logger.info("Analyzing %s on %s", title, emotion)
    report['model'] = title
    report['target'] = emotion
    report['width'] = width
    report['location'] = location

    # Keep an untrained copy: `model` is consumed by the feature selection below.
    backup_model = sk.base.clone(model)
    try:
        features = analyze_model(model, x_train, y_train_target, n_jobs=n_jobs)
    except BaseException as e:
        import textwrap
        logger.error("There was an error of type %s", str(type(e)))
        logger.error("Error message: %s", str(e))
        with open(out_path / f"no-{emotion}.txt", 'w', encoding='utf-8') as file:
            file.write("Error in generating the model.\n\n")
            file.write("Error message\n")
            file.write("-------------\n\n")
            file.write(textwrap.fill(str(e), 80))
        return report

    plot_sequential_feature_selection(features.get_metric_dict(), kind='std_dev')
    logger.info("Saving feature selection diagram")
    plt.title(f'{title} on {emotion} (w/StdDev, width: {width}, location: {location})')
    plt.grid()
    plt.savefig(out_path / f"{emotion}.svg")
    plt.close()  # release the figure so later calls start from a clean slate

    if cv is not None:
        logger.info("Cross validating model")
        scores = cross_validate(
            model,
            features.transform(x_train),
            y_train_target,
            cv=cv,
            n_jobs=n_jobs,
        )
        logger.info("Saving cross validation results to a CSV")
        pd.DataFrame(scores).to_csv(out_path / f'{emotion}-cv.csv', encoding='utf-8')

    logger.info("Training final model")
    start_time = time.time()
    backup_model.fit(features.transform(x_train), y_train_target)
    end_time = time.time()
    report['training_time'] = end_time - start_time
    logger.info("Training completed in %.3f seconds", report['training_time'])

    try:
        logger.info("Testing final model")
        y_pred = backup_model.predict(features.transform(x_test))
        report['test_accuracy'] = sk.metrics.accuracy_score(y_test_target, y_pred)
        cm = sk.metrics.classification_report(
            y_test_target, y_pred,
            # target_names=(y_test.columns if emotion == 'all' else range(7))
        )
        with open(out_path / f"{emotion}-report.txt", 'w', encoding='utf-8') as file:
            logger.info("Saving report to file")
            file.write(cm)
    except BaseException as e:
        logger.error(
            "There was a %s in the testing phase. Testing phase skipped."
            "\nError message: %s", str(type(e)), str(e))

    logger.info("Saving final model to file...")
    joblib.dump(backup_model, out_path / f"{emotion}.joblib")

    report['n_features'] = len(features.k_feature_names_)
    report['features'] = features.k_feature_names_
    report['score'] = features.k_score_
    return report
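
# test_model assumes analyze_model returns a *fitted* object exposing
# get_metric_dict(), transform(), k_feature_names_ and k_score_, i.e. an
# mlxtend SequentialFeatureSelector. A minimal sketch of such a helper follows;
# it is hypothetical, and the k_features='best', forward=True and
# scoring='accuracy' choices are assumptions, not the project's actual settings.
from mlxtend.feature_selection import SequentialFeatureSelector


def analyze_model(model, x_train, y_train, n_jobs=1):
    """Fit a forward feature selector around `model` and return the fitted selector."""
    selector = SequentialFeatureSelector(model,
                                         k_features='best',
                                         forward=True,
                                         scoring='accuracy',
                                         cv=5,
                                         n_jobs=n_jobs)
    return selector.fit(x_train, y_train)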
# Module-level imports assumed by this snippet (tts_std, confusion_matrix,
# my_auc and mean_abs_error are project-specific helpers):
import matplotlib.pyplot as plt
import pandas as pd
import mlxtend.feature_selection as mlx
import mlxtend.plotting as mlxp
import sklearn.linear_model as skllm
import sklearn.metrics as sklm
import sklearn.model_selection as sklms


def step_forward(X, y, name):
    print(name)
    # Inspiration: https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html

    # Set up standardized training/testing data.
    X_train_std, X_test_std, y_train, y_test = tts_std(X, y)

    # Build the logistic regression classifier used inside feature selection. The
    # liblinear solver is a good fit for L1 penalties and high-dimensional data,
    # though once the data is standardized all solvers give similar accuracy.
    # max_iter is left at the default (100); raise it if you see convergence warnings.
    clf = skllm.LogisticRegression(penalty='l1', C=0.1, solver='liblinear', max_iter=100)

    # Build the step-forward feature selector: cv=10 runs 10-fold cross-validation
    # for each candidate subset, k_features=5 selects the 5 attributes that best
    # describe the target, and verbose controls progress logging.
    sfs1 = mlx.SequentialFeatureSelector(clf,
                                         k_features=5,
                                         forward=True,
                                         floating=False,
                                         verbose=0,
                                         scoring='accuracy',
                                         cv=10)

    # Perform SFS.
    sfs1 = sfs1.fit(X_train_std, y_train, custom_feature_names=X.columns)

    # Which features?
    print('\t' + 'Top 5 features: ' + str(sfs1.k_feature_names_))
    feat_cols1 = list(sfs1.k_feature_idx_)

    # Build the full model with the selected features (SFS itself has no predict
    # method). Refitting logistic regression on JUST those features shows how well
    # they predict the classification of single loaded, clear, straight, etc.
    clf = skllm.LogisticRegression(penalty='l1', C=0.1, solver='liblinear', max_iter=100)
    clf.fit(X_train_std[:, feat_cols1], y_train)

    # 'kind' selects the error bar drawn in the plot ({'std_dev', 'std_err', 'ci',
    # None}); the bar shows the spread of the CV scores.
    fig1 = mlxp.plot_sequential_feature_selection(sfs1.get_metric_dict(), kind='std_dev')
    plt.title('Sequential Forward Feature Selection CV Scores: ' + name + ' (std dev)')
    plt.ylabel('Mean CV Score')
    plt.grid()
    plt.savefig('feature_selection/sfs_' + name + ".png")
    plt.close()

    # Accuracy.
    y_train_pred = clf.predict(X_train_std[:, feat_cols1])
    print('\tTraining accuracy on selected features: %.3f'
          % sklm.accuracy_score(y_train, y_train_pred))
    print('\tTraining mean absolute error on selected features: %.3f'
          % mean_abs_error(y_train, y_train_pred))

    y_test_pred = clf.predict(X_test_std[:, feat_cols1])
    print('\tTesting accuracy on selected features: %.3f'
          % sklm.accuracy_score(y_test, y_test_pred))
    print('\tTesting mean absolute error on selected features: %.3f'
          % mean_abs_error(y_test, y_test_pred))

    # Confusion matrix generation.
    confusion_matrix(y_train, y_train_pred, name + "_sfs_Training_Data_")
    confusion_matrix(y_test, y_test_pred, name + "_sfs_Testing_Data_")

    my_auc(y_train, X_train_std[:, feat_cols1], name + '_sfs_training', sfs1.k_feature_names_)

    # CV scores. Note: this refits clf on the full, unstandardized feature set in
    # each fold, not on the selected standardized features.
    scores = sklms.cross_val_score(clf, X, y, cv=4)
    print('\t' + name + ' CVs: ' + str(scores))

    return sfs1, clf, pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
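
# step_forward relies on a helper tts_std that, judging by its name and use,
# splits the data and standardizes the feature matrices. A plausible sketch
# follows; the test_size and random_state values are assumptions, and the real
# project helper may differ. The scaler is fit on the training split only so
# that no test-set statistics leak into training.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def tts_std(X, y, test_size=0.25, random_state=42):
    """Split into train/test sets and standardize features using train-set statistics."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test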