import numpy as np
from scipy.stats import wasserstein_distance
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# get_preprocessor is a project-local helper defined elsewhere.


def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed
    """

    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    xpipe = Pipeline(steps=[
        ('pca', PCA(n_components=2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01))
    ])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        # draw an 80% bootstrap sample of the data (with replacement)
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        # flag outliers in the bootstrap sample and record the Wasserstein
        # distances between the full sample and the bootstrap sample
        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pp.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = (outliers_X[int(0.975 * bs_samples)] +
                           outliers_X[int(0.025 * bs_samples)])

    wasserstein_X.sort()
    wasserstein_X_threshold = (wasserstein_X[int(0.975 * bs_samples)] +
                               wasserstein_X[int(0.025 * bs_samples)])

    wasserstein_y.sort()
    wasserstein_y_threshold = (wasserstein_y[int(0.975 * bs_samples)] +
                               wasserstein_y[int(0.025 * bs_samples)])

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return to_return
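
A minimal usage sketch (not part of the original source): checking a fresh batch of data against the returned thresholds. Here `tools` comes from get_monitoring_tools(X, y), and `new_X`/`new_y` are hypothetical incoming data with the same schema as the training set.

# Sketch only: drift check for a new batch; `new_X` and `new_y` are
# hypothetical new data, not defined in the original code.
from scipy.stats import wasserstein_distance

tools = get_monitoring_tools(X, y)
new_X_pp = tools["preprocessor"].transform(new_X)

preds = tools["clf_X"].predict(new_X_pp)  # +1 = inlier, -1 = outlier
outlier_pct = 100 * (1.0 - (preds[preds == 1].size / preds.size))
dist_X = wasserstein_distance(tools["X_source"].flatten(), new_X_pp.flatten())
dist_y = wasserstein_distance(tools["y_source"], new_y)

if outlier_pct > tools["outlier_X"]:
    print("outlier percentage exceeds bootstrap threshold")
if dist_X > tools["wasserstein_X"] or dist_y > tools["wasserstein_y"]:
    print("distribution drift exceeds bootstrap threshold")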
Example #2
import logging.config
import tempfile
from pathlib import Path

import mlflow
import numpy as np
from dotenv import load_dotenv
from hyperopt import STATUS_OK, Trials, fmin, tpe
from sklearn.pipeline import make_pipeline

# load_config, load_training_data, split_data, get_preprocessor, get_params,
# train_run, flatten_params, prep_params, log_sk_model, shap_analyse and
# _logger are project-local helpers defined elsewhere.


def main():
    load_dotenv('.env.general')
    config = load_config('config.yml')
    Path(config.logging.handlers.debug_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    Path(config.logging.handlers.info_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        mlflow.set_experiment(config.experiment.name)

        params = {}
        tags = {}
        metrics = {}
        artifacts = {}

        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()

            if search_space is None:
                estimator, estimator_tags, estimator_metrics, estimator_artifacts = train_run(
                    estimator_params=estimator_params,
                    x_train_prep=x_train_prep,
                    y_train=y_train,
                    x_test_prep=x_test_prep,
                    y_test=y_test,
                    temp_dir=temp_dir)

                model = make_pipeline(preprocessor, estimator)
                params.update(
                    {f"estimator_{k}": v
                     for k, v in estimator_params.items()})
                tags.update(
                    {f"estimator_{k}": v
                     for k, v in estimator_tags.items()})
                metrics.update(estimator_metrics)
                artifacts.update(estimator_artifacts)

            else:

                def hyperopt_objective(search_params):
                    # This function is called for each set of hyper-parameters being tested by HyperOpt.
                    run_name = str(len(trials) - 1)
                    ho_params = {}
                    ho_tags = {}
                    ho_metrics = {}
                    ho_artifacts = {}

                    search_params = flatten_params(search_params)
                    search_params = prep_params(search_params)
                    ho_estimator_params = estimator_params.copy()
                    ho_estimator_params.update(search_params)

                    with mlflow.start_run(nested=True, run_name=run_name):
                        ho_estimator, ho_estimator_tags, ho_estimator_metrics, ho_estimator_artifacts = train_run(
                            estimator_params=ho_estimator_params,
                            x_train_prep=x_train_prep,
                            y_train=y_train,
                            x_test_prep=x_test_prep,
                            y_test=y_test,
                            temp_dir=temp_dir / run_name)

                        ho_model = make_pipeline(preprocessor, ho_estimator)
                        ho_params.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_params.items()
                        })
                        ho_tags.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_tags.items()
                        })
                        ho_metrics.update(ho_estimator_metrics)
                        ho_artifacts.update(ho_estimator_artifacts)

                        ho_tags['hyperopt'] = True

                        log_sk_model(ho_model,
                                     registered_model_name=None,
                                     params=ho_params,
                                     tags=ho_tags,
                                     metrics=ho_metrics,
                                     artifacts=ho_artifacts)

                    loss = 1 - ho_metrics[config.evaluation.primary_metric]

                    return {
                        'loss': loss,
                        'status': STATUS_OK,
                        'model': ho_model,
                        'params': ho_params,
                        'tags': ho_tags,
                        'metrics': ho_metrics,
                        'artifacts': ho_artifacts
                    }

                trials = Trials()
                fmin(fn=hyperopt_objective,
                     space=search_space,
                     algo=tpe.suggest,
                     trials=trials,
                     max_evals=config.training.max_evals,
                     rstate=np.random.RandomState(1),  # hyperopt >= 0.2.7 expects np.random.default_rng(1)
                     show_progressbar=False)

                model = trials.best_trial['result']['model']
                params = trials.best_trial['result']['params']
                tags = trials.best_trial['result']['tags']
                metrics = trials.best_trial['result']['metrics']
                artifacts = trials.best_trial['result']['artifacts']

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=Path(temp_dir) / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)

    return (x_train, y_train, x_test,
            y_test), model, params, tags, metrics, artifacts
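
For context, a hedged sketch of what get_params could return for the hyperopt branch above; the estimator parameter names and ranges here are illustrative assumptions, not taken from the original project.

# Sketch only: one possible (estimator_params, search_space) pair for the
# hyperopt branch. 'max_depth' and 'learning_rate' are assumed names.
import numpy as np
from hyperopt import hp


def get_params():
    estimator_params = {'n_estimators': 200}
    search_space = {
        'max_depth': hp.choice('max_depth', [3, 5, 7]),
        'learning_rate': hp.loguniform('learning_rate',
                                       np.log(0.01), np.log(0.3)),
    }
    return estimator_params, search_space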
Example #3
import numpy as np
from scipy.stats import wasserstein_distance
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# get_preprocessor is a project-local helper defined elsewhere.


def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed
    """

    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    ## fit an ellipse (a robust multivariate Gaussian estimate) to the
    ## data and flag points as outliers according to the contamination
    ## parameter, here 1% of the data
    xpipe = Pipeline(steps=[
        ('pca', PCA(n_components=2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01))
    ])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        # draw an 80% bootstrap sample of the data (with replacement)
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        ## flag outliers using the elliptic envelope fitted to
        ## the entire dataset
        test1 = xpipe.predict(X_bs)

        ## Wasserstein distance between the distribution of the
        ## entire sample and the bootstrap-sample distribution
        wasserstein_X[b] = wasserstein_distance(X_pp.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())

        ## get the percentage of outliers
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()

    # threshold: sum of the 2.5% and 97.5% bootstrap quantiles,
    # an interquantile-style cutoff
    outlier_X_threshold = (outliers_X[int(0.975 * bs_samples)] +
                           outliers_X[int(0.025 * bs_samples)])

    wasserstein_X.sort()
    # threshold taken from the bootstrap distribution of Wasserstein
    # distances between the full sample and its bootstrap samples
    wasserstein_X_threshold = (wasserstein_X[int(0.975 * bs_samples)] +
                               wasserstein_X[int(0.025 * bs_samples)])

    wasserstein_y.sort()
    wasserstein_y_threshold = (wasserstein_y[int(0.975 * bs_samples)] +
                               wasserstein_y[int(0.025 * bs_samples)])

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return to_return
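
The sort-and-index arithmetic above can be written more directly with np.quantile; a minimal sketch of the same sum-of-quantiles threshold follows (the bootstrap statistics here are stand-in data, not from the original).

# Sketch only: the bootstrap threshold expressed with np.quantile,
# avoiding manual sorting and index arithmetic.
import numpy as np

bootstrap_stats = np.random.default_rng(0).normal(size=1000)  # stand-in data
threshold = (np.quantile(bootstrap_stats, 0.975) +
             np.quantile(bootstrap_stats, 0.025))
print(np.round(threshold, 2))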