def __init__(self, clf, scoring="roc_auc", verbose=0):
    """
    Set up the object with a model, a scoring metric and a verbosity level.

    Args:
        clf (binary classifier):
            Model fitted on X_train.

        scoring (string or probatus.utils.Scorer, optional):
            Metric for which the model performance is calculated. Either a metric name that matches one of the
            predefined classification scorer names in sklearn
            ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)), or a probatus.utils.Scorer
            that defines a custom metric.

        verbose (int, optional):
            Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only most important warnings
            - 51 - 100 - shows other warnings and prints
            - above 100 - presents all prints and all warnings (including SHAP warnings).
    """
    # Plain configuration values are stored exactly as passed in.
    self.verbose = verbose
    self.clf = clf

    # The `scoring` argument is handed to get_single_scorer and the result is kept as the scorer.
    self.scorer = get_single_scorer(scoring)
def __init__(
    self,
    clf,
    scoring="roc_auc",
    test_prc=0.25,
    n_jobs=1,
    verbose=0,
    random_state=None,
):
    """
    Store the model and the evaluation settings on the instance.

    Args:
        clf (model object):
            Binary classification model or pipeline.

        scoring (string or probatus.utils.Scorer, optional):
            Metric for which the model performance is calculated. Either a metric name that matches one of the
            predefined
            [classification scorers names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html),
            or a probatus.utils.Scorer that defines a custom metric. The recommended option for this class is
            'roc_auc'.

        test_prc (float, optional):
            Percentage of data used to test the model. Defaults to 0.25.

        n_jobs (int, optional):
            Number of parallel executions. If -1 use all available cores. Defaults to 1.

        verbose (int, optional):
            Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only most important warnings
            - 51 - 100 - shows other warnings and prints
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        random_state (int, optional):
            Random state stored on the instance. With None the results will not be reproducible; set it to an
            integer for reproducible results.
    """  # noqa
    # Model under evaluation.
    self.clf = clf

    # Evaluation settings, stored exactly as passed in.
    self.test_prc = test_prc
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.verbose = verbose

    # The `scoring` argument is handed to get_single_scorer and the result is kept as the scorer.
    self.scorer = get_single_scorer(scoring)
def __init__(
    self,
    clf,
    step=1,
    min_features_to_select=1,
    cv=None,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=0,
    random_state=None,
):
    """
    This method initializes the class.

    Args:
        clf (binary classifier, sklearn compatible search CV e.g. GridSearchCV, RandomizedSearchCV or BayesSearchCV):
            A model that will be optimized and trained at each round of feature elimination. The recommended model
            is [LGBMClassifier](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html),
            because it by default handles the missing values and categorical variables. This parameter also supports
            any hyperparameter search schema that is consistent with the sklearn API e.g.
            [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html),
            [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
            or [BayesSearchCV](https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV).

        step (int or float, optional):
            Number of lowest importance features removed each round. If it is an int, then each round such a number
            of features are discarded. If float, such a percentage of remaining features (rounded down) is removed
            each iteration. It is recommended to use float, since it is faster for a large number of features, and
            slows down and becomes more precise with fewer features. Note: the last round may remove fewer features
            in order to reach min_features_to_select. If columns_to_keep parameter is specified in the fit method,
            step is the number of features to remove after keeping those columns. Must be greater than 0.

        min_features_to_select (int, optional):
            Minimum number of features to be kept. This is a stopping criterion of the feature elimination. By
            default the process stops when one feature is left. Must be a positive integer. If columns_to_keep is
            specified in the fit method, it may override this parameter with the larger of the two values
            (min_features_to_select and the number of columns in columns_to_keep).

        cv (int, cross-validation generator or an iterable, optional):
            Determines the cross-validation splitting strategy. Compatible with sklearn
            [cv parameter](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html).
            If None, then cv of 5 is used.

        scoring (string or probatus.utils.Scorer, optional):
            Metric for which the model performance is calculated. It can be either a metric name aligned with
            predefined
            [classification scorers names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html).
            Another option is using probatus.utils.Scorer to define a custom metric.

        n_jobs (int, optional):
            Number of cores to run in parallel while fitting across folds. None means 1 unless in a
            `joblib.parallel_backend` context. -1 means using all processors.

        verbose (int, optional):
            Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only most important warnings
            - 51 - 100 - shows other warnings and prints
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        random_state (int, optional):
            Random state set at each round of feature elimination. If it is None, the results will not be
            reproducible and in random search at each iteration a different hyperparameters might be tested. For
            reproducible results set it to an integer.

    Raises:
        ValueError: If step is not an int or float greater than 0, or if min_features_to_select is not an int
            greater than 0.
    """  # noqa
    self.clf = clf
    # Flag models that are sklearn-style hyperparameter search wrappers (subclasses of BaseSearchCV).
    self.search_clf = isinstance(self.clf, BaseSearchCV)

    if not (isinstance(step, (int, float)) and step > 0):
        raise ValueError(
            f"The current value of step = {step} is not allowed. "
            f"It needs to be a positive integer or positive float."
        )
    self.step = step

    # The check rejects 0, so the message must ask for a positive integer (it used to claim ">= 0").
    if not (isinstance(min_features_to_select, int) and min_features_to_select > 0):
        raise ValueError(
            f"The current value of min_features_to_select = {min_features_to_select} is not allowed. "
            f"It needs to be a positive integer."
        )
    self.min_features_to_select = min_features_to_select

    self.cv = cv
    # The `scoring` argument is handed to get_single_scorer and the result is kept as the scorer.
    self.scorer = get_single_scorer(scoring)
    self.random_state = random_state
    self.n_jobs = n_jobs
    # The report starts out as an empty DataFrame.
    self.report_df = pd.DataFrame([])
    self.verbose = verbose
def __init__(
    self,
    clf,
    strategies,
    scoring="roc_auc",
    cv=5,
    model_na_support=False,
    n_jobs=-1,
    verbose=0,
    random_state=None,
):
    """
    Initialise the class.

    Args:
        clf (binary classifier, sklearn.Pipeline):
            A binary classification model that will be used to evaluate various imputation strategies.

        strategies (dictionary of sklearn.impute objects or any other scikit-learn compatible imputer):
            Dictionary containing the imputer objects, e.g.

                strategies = {
                    'KNN': KNNImputer(n_neighbors=3),
                    'Simple Median Imputer': SimpleImputer(strategy='median', add_indicator=True),
                    'Iterative Imputer': IterativeImputer(
                        add_indicator=True, n_nearest_features=5, sample_posterior=True
                    ),
                }

            This gives fine-grained control over the imputation method.

        scoring (string or probatus.utils.Scorer, optional):
            Metric for which the score is calculated. A metric name needs to be aligned with the predefined
            [classification scorers names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html).
            Another option is using probatus.utils.Scorer to define a custom metric. The value is passed to
            get_single_scorer.

        cv (optional):
            Cross-validation setting, stored on the instance as given. Defaults to 5.
            NOTE(review): presumably accepts the same values as the sklearn `cv` parameter - confirm against
            the code that consumes it.

        model_na_support (boolean, optional):
            Whether the classifier supports missing values by default, e.g. LightGBM, XGBoost etc. If True, a
            default comparison `No Imputation` result will be added, indicating the model performance without any
            explicit imputation. If False, only the provided strategies will be used. Defaults to False.

        n_jobs (int, optional):
            Number of cores to run in parallel while fitting across folds. None means 1 unless in a
            `joblib.parallel_backend` context. -1 means using all processors.

        verbose (int, optional):
            Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
            - 51 - 100 - shows most important warnings, prints of the feature removal process
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        random_state (int, optional):
            Random state stored on the instance. With None the results will not be reproducible; set it to an
            integer for reproducible results.
    """  # noqa
    # Model and the imputation strategies to compare.
    self.clf = clf
    self.strategies = strategies
    self.model_na_support = model_na_support

    # Evaluation settings; `scoring` is handed to get_single_scorer and the result is kept as the scorer.
    self.cv = cv
    self.scorer = get_single_scorer(scoring)

    # Runtime configuration, stored exactly as passed in.
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.random_state = random_state

    # Initial state: not fitted yet, and the report is an empty DataFrame.
    self.fitted = False
    self.report_df = pd.DataFrame([])