def get_monitoring_tools(X, y):
    """
    Build the thresholds, outlier model and reference data used for monitoring.

    A preprocessor obtained from ``get_preprocessor()`` is fitted on ``X``.
    The transformed data is used to fit a PCA(2) + EllipticEnvelope pipeline
    (contamination 0.01). The data is then resampled 1000 times (80% of the
    rows, drawn with replacement) to collect, per draw, the percentage of
    points flagged as outliers and the Wasserstein distances between the
    full data and the draw, for both X and y.

    Returns a dict with the rounded thresholds (``outlier_X``,
    ``wasserstein_X``, ``wasserstein_y``), the fitted ``preprocessor``, the
    fitted outlier pipeline (``clf_X``), the transformed source data
    (``X_source``), and ``y`` / ``X`` as passed in (``y_source``,
    ``latest_X``, ``latest_y``).

    NOTE: for classification the outlier detection on y is not needed
    """
    # NOTE(review): another definition named get_monitoring_tools appears
    # later in this source; if both live in one module the later one wins.
    fitted_preprocessor = get_preprocessor().fit(X)
    X_pp = fitted_preprocessor.transform(X)

    outlier_pipeline = Pipeline(steps=[
        ('pca', PCA(2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01)),
    ])
    outlier_pipeline.fit(X_pp)

    n_rounds = 1000
    n_rows = X.shape[0]
    draw_size = int(np.round(0.80 * n_rows))
    row_ids = np.arange(n_rows)

    outlier_pct = np.zeros(n_rounds)
    dist_X = np.zeros(n_rounds)
    dist_y = np.zeros(n_rounds)

    for round_no in range(n_rounds):
        picked = np.random.choice(row_ids, draw_size,
                                  replace=True).astype(int)
        y_draw = y[picked]
        X_draw = X_pp[picked, :]

        # predict() labels inliers with 1; everything else counts as outlier
        labels = outlier_pipeline.predict(X_draw)

        dist_X[round_no] = wasserstein_distance(X_pp.flatten(),
                                                X_draw.flatten())
        dist_y[round_no] = wasserstein_distance(y, y_draw.flatten())

        n_inliers = np.count_nonzero(labels == 1)
        outlier_pct[round_no] = 100 * (1.0 - (n_inliers / labels.size))

    upper_idx = int(0.975 * n_rounds)
    lower_idx = int(0.025 * n_rounds)

    def _threshold(samples):
        # NOTE(review): this is the SUM of the upper and lower bounds of the
        # 95% bootstrap interval - confirm that a sum is what is intended.
        ordered = np.sort(samples)
        return ordered[upper_idx] + ordered[lower_idx]

    return {
        "outlier_X": np.round(_threshold(outlier_pct), 1),
        "wasserstein_X": np.round(_threshold(dist_X), 2),
        "wasserstein_y": np.round(_threshold(dist_y), 2),
        "preprocessor": fitted_preprocessor,
        "clf_X": outlier_pipeline,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y,
    }
def main():
    """
    Train a model (optionally with a HyperOpt search) and log it to MLflow.

    Steps, in order:
      1. Load ``.env.general`` and ``config.yml`` and configure logging.
      2. Load the training data and split it into train / test parts.
      3. Fit the preprocessor on the training part and transform both parts.
      4. If ``get_params()`` returns no search space, train a single
         estimator; otherwise run ``fmin`` where every evaluation trains
         and logs a model in a nested MLflow run, and keep the best trial.
      5. Optionally run the shap analysis on the selected model.
      6. Log the selected model with its params, tags, metrics, artifacts.

    Returns:
        ``((x_train, y_train, x_test, y_test), model, params, tags,
        metrics, artifacts)``
    """
    load_dotenv('.env.general')
    config = load_config('config.yml')

    # Create the parent directories of both log files before logging is
    # configured.
    handlers = config.logging.handlers
    for handler in (handlers.debug_file_handler, handlers.info_file_handler):
        Path(handler.filename).parent.mkdir(parents=True, exist_ok=True)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    def _with_estimator_prefix(mapping):
        # Namespace estimator-level entries so they are recognisable in MLflow.
        return {f"estimator_{k}": v for k, v in mapping.items()}

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        mlflow.set_experiment(config.experiment.name)

        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()

            if search_space is not None:

                def hyperopt_objective(search_params):
                    # This function is called for each set of
                    # hyper-parameters being tested by HyperOpt.
                    # ``trials`` is bound below, before fmin() first calls us.
                    run_name = str(len(trials) - 1)

                    prepared = prep_params(flatten_params(search_params))
                    ho_estimator_params = estimator_params.copy()
                    ho_estimator_params.update(prepared)

                    with mlflow.start_run(nested=True, run_name=run_name):
                        (ho_estimator,
                         ho_estimator_tags,
                         ho_estimator_metrics,
                         ho_estimator_artifacts) = train_run(
                             estimator_params=ho_estimator_params,
                             x_train_prep=x_train_prep,
                             y_train=y_train,
                             x_test_prep=x_test_prep,
                             y_test=y_test,
                             temp_dir=temp_dir / run_name)

                        ho_model = make_pipeline(preprocessor, ho_estimator)

                        ho_params = _with_estimator_prefix(ho_estimator_params)
                        ho_tags = _with_estimator_prefix(ho_estimator_tags)
                        ho_metrics = dict(ho_estimator_metrics)
                        ho_artifacts = dict(ho_estimator_artifacts)
                        ho_tags['hyperopt'] = True

                        log_sk_model(ho_model,
                                     registered_model_name=None,
                                     params=ho_params,
                                     tags=ho_tags,
                                     metrics=ho_metrics,
                                     artifacts=ho_artifacts)

                    # fmin minimises, so the primary metric is turned into a loss.
                    loss = 1 - ho_metrics[config.evaluation.primary_metric]
                    return {
                        'loss': loss,
                        'status': STATUS_OK,
                        'model': ho_model,
                        'params': ho_params,
                        'tags': ho_tags,
                        'metrics': ho_metrics,
                        'artifacts': ho_artifacts,
                    }

                trials = Trials()
                fmin(fn=hyperopt_objective,
                     space=search_space,
                     algo=tpe.suggest,
                     trials=trials,
                     max_evals=config.training.max_evals,
                     rstate=np.random.RandomState(1),
                     show_progressbar=False)

                best_result = trials.best_trial['result']
                model = best_result['model']
                params = best_result['params']
                tags = best_result['tags']
                metrics = best_result['metrics']
                artifacts = best_result['artifacts']
            else:
                (estimator,
                 estimator_tags,
                 estimator_metrics,
                 estimator_artifacts) = train_run(
                     estimator_params=estimator_params,
                     x_train_prep=x_train_prep,
                     y_train=y_train,
                     x_test_prep=x_test_prep,
                     y_test=y_test,
                     temp_dir=temp_dir)

                model = make_pipeline(preprocessor, estimator)
                params = _with_estimator_prefix(estimator_params)
                tags = _with_estimator_prefix(estimator_tags)
                metrics = dict(estimator_metrics)
                artifacts = dict(estimator_artifacts)

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=temp_dir / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            # Log the selected model in the parent run.
            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)

    return (x_train, y_train, x_test, y_test), model, params, tags, metrics, artifacts
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances

    The preprocessor returned by ``get_preprocessor()`` is fitted on ``X``;
    the transformed data is used to fit the outlier pipeline and serves as
    the reference distribution. Thresholds are estimated from 1000 random
    subsamples (80% of the rows, drawn with replacement).

    Returns a dict with the keys ``outlier_X``, ``wasserstein_X``,
    ``wasserstein_y`` (rounded thresholds), ``preprocessor``, ``clf_X``
    (fitted outlier pipeline), ``X_source`` (transformed X), ``y_source``,
    ``latest_X`` and ``latest_y``.

    NOTE: for classification the outlier detection on y is not needed
    """
    # NOTE(review): an earlier definition named get_monitoring_tools appears
    # in this source; if both live in one module this later one wins.
    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    ## fit an elliptic envelope (a multivariate gaussian distribution) to
    ## the first two principal components of the data and classify points
    ## as outliers according to the contamination parameter, here 1% of
    ## the data.
    xpipe = Pipeline(steps=[
        ('pca', PCA(2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01)),
    ])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    ## values that do not change between subsamples are computed once
    n_samples = int(np.round(0.80 * X.shape[0]))
    all_indices = np.arange(X.shape[0])
    X_pp_flat = X_pp.flatten()

    for b in range(bs_samples):
        ## get a subsample of the data (drawn with replacement)
        subset_indices = np.random.choice(all_indices,
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        ## get the outliers according to the elliptic envelope fitted to
        ## the entire dataset.
        test1 = xpipe.predict(X_bs)

        ## get the wasserstein distance between the distribution of the
        ## entire sample and the random subsample distribution.
        wasserstein_X[b] = wasserstein_distance(X_pp_flat, X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())

        ## get the percentage of outliers (predict() labels inliers as 1)
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals.
    ## ndarray.sort() sorts in place and returns None, so its result is
    ## deliberately not used (the former debug prints only showed "None").
    ## NOTE(review): each threshold is the SUM of the 97.5% and 2.5%
    ## order statistics, not their difference - confirm this is intended.
    upper = int(0.975 * bs_samples)
    lower = int(0.025 * bs_samples)

    outliers_X.sort()
    outlier_X_threshold = outliers_X[upper] + outliers_X[lower]

    ## thresholds from the distribution of the wasserstein distances
    ## observed among the subsamples.
    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[upper] + wasserstein_X[lower]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[upper] + wasserstein_y[lower]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y,
    }
    return to_return