def save_config(file_name: str, globals_d: dict):

    """
    This function saves all environment variables to a file, allowing you to
    later resume modeling without rerunning setup().

    Example
    -------
    >>> save_config('myvars.pkl')

    This will save all environment variables to 'myvars.pkl'.

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if k != "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing save_config()")
    logger.info(f"save_config({function_params_str})")

    # Persist only the variables registered in the 'pycaret_globals' whitelist.
    globals_to_dump = {
        k: v for k, v in globals_d.items() if k in globals_d["pycaret_globals"]
    }

    import joblib

    joblib.dump(globals_to_dump, file_name)

    logger.info(f"Global variables dumped to {file_name}")
    logger.info(
        "save_config() successfully completed......................................"
    )
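# Usage sketch for save_config(): a minimal, hedged example assuming the
# caller's globals() contains the "pycaret_globals" whitelist that setup()
# populates. The resulting pickle can be restored with load_config() below.
#
#   >>> save_config("myvars.pkl", globals())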
def __init__(
    self,
    estimator,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    groups=None,
    **fit_kwargs,
):
    logger = get_logger()
    # Work on a copy so the caller's estimator is never mutated.
    self.estimator = deepcopy(estimator)
    if not is_fitted(self.estimator):
        try:
            # Some meta-estimators carry fit state on their final estimator.
            self.estimator._carry_over_final_estimator_fit_vars()
        except Exception:
            pass
    if not is_fitted(self.estimator):
        logger.info(f"fit_if_not_fitted: {estimator} is not fitted, fitting")
        try:
            self.estimator.fit(X_train, y_train, groups=groups, **fit_kwargs)
        except TypeError:
            # The estimator's fit() does not accept a 'groups' argument.
            self.estimator.fit(X_train, y_train, **fit_kwargs)
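# Usage sketch: fit_if_not_fitted is used as a context manager elsewhere in
# this module (see __create_resplots below); it yields an estimator that is
# guaranteed to be fitted without mutating the original:
#
#   >>> with fit_if_not_fitted(model, X_train, y_train) as fitted_model:
#   ...     predictions = fitted_model.predict(X_train)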
def can_early_stop(
    estimator,
    consider_partial_fit,
    consider_warm_start,
    consider_xgboost,
    params,
):
    """
    From https://github.com/ray-project/tune-sklearn/blob/master/tune_sklearn/tune_basesearch.py.

    Helper method to determine if it is possible to do early stopping.
    Only sklearn estimators with ``partial_fit`` or ``warm_start`` can be
    early stopped. ``warm_start`` works by picking up training from the
    previous call to ``fit``.

    Returns
    -------
    bool
        True if the estimator can be early stopped.
    """

    logger = get_logger()

    from sklearn.tree import BaseDecisionTree
    from sklearn.ensemble import BaseEnsemble

    try:
        # For pipelines, inspect the final step.
        base_estimator = estimator.steps[-1][1]
    except Exception:
        base_estimator = estimator

    if consider_partial_fit:
        can_partial_fit = supports_partial_fit(base_estimator, params=params)
    else:
        can_partial_fit = False

    if consider_warm_start:
        is_not_tree_subclass = not issubclass(type(base_estimator), BaseDecisionTree)
        is_ensemble_subclass = issubclass(type(base_estimator), BaseEnsemble)

        can_warm_start = hasattr(base_estimator, "warm_start") and (
            (
                hasattr(base_estimator, "max_iter")
                and is_not_tree_subclass
                and not is_ensemble_subclass
            )
            or (is_ensemble_subclass and hasattr(base_estimator, "n_estimators"))
        )
    else:
        can_warm_start = False

    if consider_xgboost:
        from xgboost.sklearn import XGBModel

        is_xgboost = isinstance(base_estimator, XGBModel)
    else:
        is_xgboost = False

    logger.info(
        f"can_partial_fit: {can_partial_fit}, can_warm_start: {can_warm_start}, is_xgboost: {is_xgboost}"
    )

    return can_partial_fit or can_warm_start or is_xgboost
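# Usage sketch for can_early_stop(), with scikit-learn estimators whose
# capabilities are known: SGDClassifier implements partial_fit, and
# RandomForestClassifier is an ensemble exposing warm_start and n_estimators.
# (Assumes the supports_partial_fit() helper detects the partial_fit method.)
#
#   >>> from sklearn.linear_model import SGDClassifier
#   >>> from sklearn.ensemble import RandomForestClassifier
#   >>> can_early_stop(SGDClassifier(), True, False, False, params=None)           # True
#   >>> can_early_stop(RandomForestClassifier(), False, True, False, params=None)  # True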
def load_config(file_name: str, globals_d: dict):

    """
    This function loads environment variables from a file created with
    save_config(), allowing you to resume modeling without rerunning setup().

    Example
    -------
    >>> load_config('myvars.pkl')

    This will load all environment variables from 'myvars.pkl'.

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if k != "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing load_config()")
    logger.info(f"load_config({function_params_str})")

    import joblib

    loaded_globals = joblib.load(file_name)

    logger.info(f"Global variables loaded from {file_name}")

    for k, v in loaded_globals.items():
        globals_d[k] = v

    # The logger is recreated rather than unpickled.
    globals_d["logger"] = get_logger()

    logger.info(f"Global variables set to match those in {file_name}")

    logger.info(
        "load_config() successfully completed......................................"
    )
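# Usage sketch for load_config(): restores the variables persisted by
# save_config() into the target globals dict (here, hypothetically, a fresh
# session's globals()):
#
#   >>> load_config("myvars.pkl", globals())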
def get_groups(
    groups: Optional[Union[str, pd.DataFrame]],
    X_train: pd.DataFrame,
    default: Optional[pd.DataFrame],
):
    logger = get_logger()
    if groups is None:
        return default
    if isinstance(groups, str):
        if groups not in X_train.columns:
            raise ValueError(
                f"Column {groups} used for groups is not present in the dataset."
            )
        groups = X_train[groups]
    else:
        if groups.shape[0] != X_train.shape[0]:
            raise ValueError(
                f"groups has length {groups.shape[0]} which doesn't match "
                f"X_train length of {X_train.shape[0]}."
            )

    return groups
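# Usage sketch for get_groups() with illustrative data: a string resolves to
# a column of X_train, None falls back to the supplied default, and
# array-likes are length-checked against X_train:
#
#   >>> X_train = pd.DataFrame({"feature": [1, 2, 3], "subject": ["a", "a", "b"]})
#   >>> get_groups("subject", X_train, default=None)  # returns X_train["subject"]
#   >>> get_groups(None, X_train, default=None)       # returns None (the default)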
def set_config(variable: str, value, globals_d: dict):

    """
    This function is used to reset global environment variables.

    Example
    -------
    >>> set_config('seed', 123)

    This will set the global seed to '123'.

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if k != "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing set_config()")
    logger.info(f"set_config({function_params_str})")

    if variable.startswith("_"):
        raise ValueError(f"Variable {variable} is read only ('_' prefix).")

    if variable not in globals_d["pycaret_globals"] or variable == "pycaret_globals":
        raise ValueError(
            f"Variable {variable} not found. Possible variables are: {globals_d['pycaret_globals']}"
        )

    globals_d[variable] = value

    # Special case: on CPU-only sessions, keep the GPU job count in sync.
    if not globals_d["gpu_param"] and variable == "n_jobs_param":
        globals_d["_gpu_n_jobs_param"] = value

    logger.info(f"Global variable: {variable} updated to {value}")
    logger.info(
        "set_config() successfully completed......................................"
    )
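# Usage sketch for the special case above: when gpu_param is False, updating
# "n_jobs_param" also mirrors the value into "_gpu_n_jobs_param" (which is
# otherwise read-only because of its '_' prefix):
#
#   >>> set_config("n_jobs_param", 2, globals())
#   >>> globals()["_gpu_n_jobs_param"]  # also 2 on CPU-only sessions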
def get_config(variable: str, globals_d: dict):

    """
    This function is used to access global environment variables.

    Example
    -------
    >>> X_train = get_config('X_train')

    This will return the transformed X_train dataset.

    Returns
    -------
    variable

    """

    function_params_str = ", ".join(
        [f"{k}={v}" for k, v in locals().items() if k != "globals_d"]
    )

    logger = get_logger()

    logger.info("Initializing get_config()")
    logger.info(f"get_config({function_params_str})")

    if variable not in globals_d["pycaret_globals"]:
        raise ValueError(
            f"Variable {variable} not found. Possible variables are: {globals_d['pycaret_globals']}"
        )

    global_var = globals_d[variable]

    logger.info(f"Global variable: {variable} returned as {global_var}")
    logger.info(
        "get_config() successfully completed......................................"
    )

    return global_var
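# Usage sketch for get_config(): unknown names raise a ValueError listing the
# valid variables, so typos fail fast:
#
#   >>> X_train = get_config("X_train", globals())
#   >>> get_config("not_a_variable", globals())  # raises ValueError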
def __init__(
    self,
    verbose: bool = True,
    html_param: bool = True,
    progress_args: Optional[Dict[str, Any]] = None,
    master_display_columns: Optional[List[str]] = None,
    monitor_rows: Optional[List[List[str]]] = None,
    round: int = 4,
):
    self.logger = get_logger()
    self.verbose = verbose
    self.html_param = html_param
    self.round = round
    try:
        self.environment = str(get_ipython())
        self.environment = "google.colab" if is_in_colab() else self.environment
    except Exception:
        # get_ipython() is not defined outside IPython/Jupyter.
        self.environment = ""

    if not self.verbose:
        return

    self.logger.info("Preparing display monitor")

    # progress bar
    if progress_args and self.verbose and self.html_param:
        progress_args = {**self.default_progress_args, **progress_args}
        self.progress = ipw.IntProgress(**progress_args)

    if master_display_columns:
        self.master_display = pd.DataFrame(columns=master_display_columns)

    if monitor_rows and self.html_param:
        # Column names are strings of 0, 1, 2, ... spaces so the monitor
        # renders without visible headers; the empty-named first column
        # becomes the index.
        self.monitor = pd.DataFrame(
            monitor_rows,
            columns=[" " * i for i in range(len(monitor_rows[0]))],
        ).set_index("")
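# Usage sketch (hypothetical arguments): constructing the Display monitor used
# by the plotting helpers below. progress_args is merged over the class's
# default_progress_args and passed to ipywidgets.IntProgress:
#
#   >>> display = Display(
#   ...     verbose=True,
#   ...     html_param=True,
#   ...     progress_args={"max": 4},
#   ...     monitor_rows=[["", "Status", "Initializing"]],
#   ... )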
def show_yellowbrick_plot(
    visualizer,
    X_train,
    y_train,
    X_test,
    y_test,
    name: str,
    handle_train: str = "fit",
    handle_test: str = "score",
    scale: float = 1,
    save: bool = False,
    fit_kwargs: Optional[dict] = None,
    groups: Optional[Any] = None,
    display: Optional[Display] = None,
    display_format: Optional[str] = None,
    **kwargs,
):
    """
    Generic method to handle yellowbrick plots.
    """
    logger = get_logger()
    visualizer.fig.set_dpi(visualizer.fig.dpi * scale)

    if not fit_kwargs:
        fit_kwargs = {}

    fit_kwargs_and_kwargs = {**fit_kwargs, **kwargs}

    if handle_train == "draw":
        logger.info("Drawing Model")
        visualizer.draw(X_train, y_train, **kwargs)
    elif handle_train == "fit":
        logger.info("Fitting Model")
        visualizer.fit(X_train, y_train, **fit_kwargs_and_kwargs)
    elif handle_train == "fit_transform":
        logger.info("Fitting & Transforming Model")
        visualizer.fit_transform(X_train, y_train, **fit_kwargs_and_kwargs)
    elif handle_train == "score":
        logger.info("Scoring train set")
        visualizer.score(X_train, y_train, **kwargs)

    # display is optional; skip progress updates when it is not provided.
    if display:
        display.move_progress()

    if handle_test == "draw":
        visualizer.draw(X_test, y_test)
    elif handle_test == "fit":
        visualizer.fit(X_test, y_test, **fit_kwargs)
    elif handle_test == "fit_transform":
        visualizer.fit_transform(X_test, y_test, **fit_kwargs)
    elif handle_test == "score":
        logger.info("Scoring test/hold-out set")
        visualizer.score(X_test, y_test)

    if display:
        display.move_progress()
        display.clear_output()

    if save:
        logger.info(f"Saving '{name}.png' in current active directory")
        visualizer.show(outpath=f"{name}.png", clear_figure=True)
    else:
        if display_format == "streamlit":
            show_yellowbrick_in_streamlit(visualizer, clear_figure=True)
        else:
            visualizer.show(clear_figure=True)

    logger.info("Visual Rendered Successfully")
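# Usage sketch with a real yellowbrick visualizer (ResidualsPlot); the model
# and data splits are assumed to come from the caller, and display=None works
# because progress updates are skipped when no Display is provided:
#
#   >>> from yellowbrick.regressor import ResidualsPlot
#   >>> show_yellowbrick_plot(
#   ...     ResidualsPlot(model), X_train, y_train, X_test, y_test,
#   ...     name="Residuals", display=None,
#   ... )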
def __create_resplots(
    self,
    model,
    x: np.ndarray,
    y: np.ndarray,
    x_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
) -> widgets.VBox:
    logger = get_logger()
    with fit_if_not_fitted(model, x, y) as fitted_model:
        fitted = fitted_model.predict(x)
        fitted_residuals = fitted - y
        if x_test is not None and y_test is not None:
            # Combine train and test predictions, tagging each point's origin.
            pred = fitted_model.predict(x_test)
            prediction_residuals = pred - y_test
            predictions = np.concatenate((fitted, pred))
            residuals = np.concatenate((fitted_residuals, prediction_residuals))
            split_origin = np.concatenate(
                (np.repeat("train", fitted.shape[0]), np.repeat("test", pred.shape[0]))
            )
            x = np.concatenate((x, x_test))
            y = np.concatenate((y, y_test))
        else:
            predictions = fitted
            residuals = fitted_residuals
            split_origin = None
    logger.info("Calculated model residuals")
    self.display.move_progress()

    tukey_anscombe_widget = TukeyAnscombeWidget(
        predictions, residuals, split_origin=split_origin
    )
    logger.info("Calculated Tukey-Anscombe Plot")
    self.figures.append(tukey_anscombe_widget)
    self.display.move_progress()

    qq_plot_widget = QQPlotWidget(
        predictions, y, split_origin=split_origin, featuresize=x.shape[1]
    )
    logger.info("Calculated Normal QQ Plot")
    self.figures.append(qq_plot_widget)
    self.display.move_progress()

    standardized_residuals = helper.calculate_standardized_residual(
        predictions, y, None
    )
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(standardized_residuals))
    scale_location_widget = ScaleLocationWidget(
        predictions, model_norm_residuals_abs_sqrt, split_origin=split_origin
    )
    logger.info("Calculated Scale-Location Plot")
    self.figures.append(scale_location_widget)
    self.display.move_progress()

    leverage = helper.leverage_statistic(np.array(x))
    n_model_params = len(model.get_params())
    distance = helper.cooks_distance(
        standardized_residuals, leverage, n_model_params=n_model_params
    )
    cooks_distance_widget = CooksDistanceWidget(
        leverage,
        distance,
        standardized_residuals,
        n_model_params,
        split_origin=split_origin,
    )
    logger.info("Calculated Residual vs Leverage Plot incl. Cook's distance")
    self.figures.append(cooks_distance_widget)
    self.display.move_progress()

    # Arrange the four diagnostic plots in a 2x2 grid.
    items_layout = Layout(width="1000px")
    h0 = widgets.HBox(self.figures[:2], layout=items_layout)
    h1 = widgets.HBox(self.figures[2:], layout=items_layout)
    return widgets.VBox([h0, h1])