Example #1
def uniform_reconstruction_error_cost(config, y_true, y_pred,
                                      **kwargs) -> EvaluationResult:
    # RMS of the per-sample reconstruction errors
    reconstruction_errors = compute_reconstruction_error(y_true, y_pred)
    reconstruction_error_cost = np.sqrt(
        np.average(np.power(reconstruction_errors, 2)))

    # RMS of the robust z-scores; penalizes unevenly distributed errors
    reconstruction_error_z_scores = z_score(reconstruction_errors)
    reconstruction_error_z_score_cost = np.sqrt(
        np.average(np.power(reconstruction_error_z_scores, 2)))

    # combined objective: magnitude of the error times its non-uniformity
    cost = reconstruction_error_cost * reconstruction_error_z_score_cost

    return EvaluationResult(cost=cost,
                            config=config,
                            info={
                                'reconstruction_error_cost':
                                reconstruction_error_cost,
                                'reconstruction_error_z_score_cost':
                                reconstruction_error_z_score_cost,
                            })
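
A minimal, self-contained usage sketch. The stubs for compute_reconstruction_error, z_score and EvaluationResult below are assumptions inferred from how they are called above, not the project's real implementations:

import numpy as np
from dataclasses import dataclass, field

@dataclass
class EvaluationResult:
    # assumed shape of the project's result container
    cost: float
    config: dict
    info: dict = field(default_factory=dict)

def compute_reconstruction_error(y_true, y_pred):
    # assumption: mean squared error per sample
    return np.mean(np.square(np.asarray(y_true) - np.asarray(y_pred)), axis=1)

def z_score(values, given_median=None, given_mad=None):
    # assumption: robust z-score centred on the median, scaled by the MAD
    values = np.asarray(values, dtype=float)
    med = np.median(values) if given_median is None else given_median
    mad = np.median(np.abs(values - med)) if given_mad is None else given_mad
    return (values - med) / mad

rng = np.random.default_rng(0)
y_true = rng.random((8, 4))
y_pred = y_true + 0.05 * rng.standard_normal((8, 4))
result = uniform_reconstruction_error_cost({}, y_true, y_pred)
print(result.cost, result.info)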
Example #2
def uniform_reconstruction_error_vs_compression_cost(
        config, y_true, y_pred, model: Model, history: dict,
        **kwargs) -> EvaluationResult:
    # compression factor rewarding smaller models
    compression = compute_model_compression(model)
    # base reconstruction error cost, delegated to the shared cost function
    reconstruction_error = reconstruction_error_cost(config, y_true, y_pred,
                                                     **kwargs).cost
    # RMS of the robust z-scores, as in Example #1
    reconstruction_error_z_scores = z_score(
        compute_reconstruction_error(y_true, y_pred))
    reconstruction_error_z_score_cost = np.sqrt(
        np.average(np.power(reconstruction_error_z_scores, 2)))

    cost = (reconstruction_error *
            reconstruction_error_z_score_cost) / compression

    return EvaluationResult(cost=cost,
                            config=config,
                            info={
                                'reconstruction_error': reconstruction_error,
                                'reconstruction_error_z_score':
                                reconstruction_error_z_score_cost,
                                'compression': compression,
                            })
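
The combined objective divides the product of the two error terms by the compression factor, so stronger compression lowers the cost. A quick numeric check of that behaviour (all values below are made up):

reconstruction_error = 0.12               # RMSE-style cost, as in Example #1
reconstruction_error_z_score_cost = 1.8   # RMS of the robust z-scores
for compression in (2.0, 4.0, 8.0):
    cost = (reconstruction_error * reconstruction_error_z_score_cost) / compression
    print(compression, round(cost, 4))
# doubling the compression halves the cost, rewarding smaller models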
Example #3
    def run_callable(run_config: dict):
        experiment.print('Loading data')
        bearing_dataset = load_data(run_config['data_set'])
        train = bearing_dataset.train(column=config['data_column'],
                                      as_numpy=True)
        test = bearing_dataset.test(column=config['data_column'],
                                    as_numpy=True)
        test_labels = bearing_dataset.test(column=config['data_column'],
                                           add_label=True)['label']
        threshold_percentile = config['threshold_percentile']
        x_train, x_valid, y_train, y_valid = train_test_split(
            train,
            train,
            test_size=config['validation_split'],
            shuffle=True,
        )
        model = KerasModel(
            create_model_function=create_deep_easing_feed_forward_autoencoder,
            evaluation_function=load_from_module(
                run_config['evaluation_function']),
        )
        history = pd.DataFrame(columns=[
            'cost', 'auc', 'accuracy', 'precision', 'recall', 'f_score',
            'matthews_cc'
        ])

        # 1-based range so the loop runs num_evaluations times and the
        # progress message below matches
        for i in range(1, config['num_evaluations'] + 1):
            experiment.print(
                f'Evaluating configuration {i} of {config["num_evaluations"]}')
            current_config = dict(config_space.sample_configuration())

            model.load_config(current_config)
            evaluation_result = model.evaluate(x_train,
                                               y_train,
                                               x_valid,
                                               y_valid,
                                               budget=config['budget'])

            # derive the anomaly threshold from the robust z-scores of the
            # training reconstruction error
            train_reconstruction_error = compute_reconstruction_error(
                y_train, model.predict(x_train))
            train_z_scores = z_score(train_reconstruction_error)
            anomaly_threshold = percentile(train_z_scores,
                                           threshold_percentile)

            test_prediction = model.predict(test)
            test_reconstruction_error = compute_reconstruction_error(
                test, test_prediction)
            # score test errors against the *train* median and MAD so both
            # sets share the same reference distribution
            test_z_scores = z_score(
                test_reconstruction_error,
                given_median=median(train_reconstruction_error),
                given_mad=mad(train_reconstruction_error))

            if np.isnan(np.sum(test_reconstruction_error)):
                experiment.print(
                    'Got a NaN value in the test reconstruction error, '
                    'skipping this evaluation.')
                continue

            anomaly_prediction = (np.array(test_z_scores) >
                                  anomaly_threshold).astype(int)
            metrics = compute_classification_metrics(test_labels.values,
                                                     anomaly_prediction)
            roc = compute_roc(test_labels.values, test_reconstruction_error)

            history_record = {
                'cost': evaluation_result.cost,
                'auc': roc['auc'],
                'accuracy': metrics['accuracy'],
                'precision': metrics['precision'],
                'recall': metrics['recall'],
                'f_score': metrics['f_score'],
                'matthews_cc': metrics['matthews_cc'],
                **{f'info_{k}': v
                   for k, v in evaluation_result.info.items()},
                **{f'config_{k}': v
                   for k, v in current_config.items()}
            }

            # DataFrame.append was removed in pandas 2.0; use concat instead
            history = pd.concat([history, pd.DataFrame([history_record])],
                                ignore_index=True)

            experiment.log('history', history)
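
Stripped of the experiment plumbing, the anomaly rule in the loop above reduces to a few lines. This sketch uses NumPy stand-ins for z_score, median, mad and percentile; the MAD-based robust z-score is an assumption based on how those helpers are called here:

import numpy as np

def robust_z_scores(errors, given_median=None, given_mad=None):
    errors = np.asarray(errors, dtype=float)
    med = np.median(errors) if given_median is None else given_median
    mad = np.median(np.abs(errors - med)) if given_mad is None else given_mad
    return (errors - med) / mad

rng = np.random.default_rng(0)
train_errors = np.abs(rng.standard_normal(1000))      # made-up train errors
test_errors = np.abs(rng.standard_normal(200)) + 0.5  # made-up test errors

# threshold: a high percentile of the train z-scores
threshold = np.percentile(robust_z_scores(train_errors), 99)

# score the test errors against the *train* median and MAD, then threshold
train_median = np.median(train_errors)
train_mad = np.median(np.abs(train_errors - train_median))
test_z = robust_z_scores(test_errors, given_median=train_median,
                         given_mad=train_mad)
anomaly_prediction = (test_z > threshold).astype(int)
print(anomaly_prediction.mean())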
Example #4
    def log_keras_predictions(self,
                              model: Union[Model, KerasModel],
                              data_frames: Dict[str, DataFrame],
                              labels: Dict[str, Series] = None,
                              key: str = None,
                              pre_processing: Callable = None,
                              pre_processing_x: Callable = None,
                              pre_processing_y: Callable = None,
                              rolling_window_size: int = 200,
                              log_samples: List[int] = [0, -1],
                              has_multiple_features: bool = False,
                              threshold_percentile: int = 99):
        """Log reconstruction errors, z-scores, health scores, ROC data and
        per-sample reconstructions for every data frame in data_frames."""
        self.print('Logging predictions')

        log_base_path = 'predictions' if key is None else f'predictions/{key}'
        pre_processing_x = pre_processing_x if pre_processing_x is not None else pre_processing
        pre_processing_y = pre_processing_y if pre_processing_y is not None else pre_processing
        train_reconstruction_error = None
        train_reconstruction_error_rolling = None
        train_z_scores = None
        train_z_scores_rolling = None

        if 'train' in data_frames:
            train_data_frame = data_frames['train']
            train_samples_x = pre_processing_x(
                train_data_frame
            ) if pre_processing_x is not None else train_data_frame.to_numpy()
            train_samples_y = pre_processing_y(
                train_data_frame
            ) if pre_processing_y is not None else train_data_frame.to_numpy()

            train_reconstruction_error = compute_reconstruction_error(
                train_samples_y,
                model.predict(train_samples_x),
                has_multiple_features=has_multiple_features)
            train_reconstruction_error_median = median(
                train_reconstruction_error)
            train_reconstruction_error_mad = mad(train_reconstruction_error)
            # element-wise robust z-scores against the train median and MAD
            train_z_scores = list(
                map(
                    lambda x: z_score(x, train_reconstruction_error_median,
                                      train_reconstruction_error_mad),
                    train_reconstruction_error))

            # moving average of the train error; mode='valid' shortens the
            # series by rolling_window_size - 1
            train_reconstruction_error_rolling = np.convolve(
                train_reconstruction_error,
                np.ones(rolling_window_size) / rolling_window_size,
                mode='valid')
            train_reconstruction_error_rolling_median = median(
                train_reconstruction_error_rolling)
            train_reconstruction_error_rolling_mad = mad(
                train_reconstruction_error_rolling)
            train_z_scores_rolling = list(
                map(
                    lambda x: z_score(
                        x, train_reconstruction_error_rolling_median,
                        train_reconstruction_error_rolling_mad),
                    train_reconstruction_error_rolling))

        for data_frame_key, data_frame in data_frames.items():
            data_frame = data_frame.copy()
            data_frame_log_path = f'{log_base_path}/{data_frame_key}'

            try:
                samples_x = pre_processing_x(
                    data_frame
                ) if pre_processing_x is not None else data_frame.to_numpy()
                samples_y = pre_processing_y(
                    data_frame
                ) if pre_processing_y is not None else data_frame.to_numpy()

                reconstruction = model.predict(samples_x)
                reconstruction_error = compute_reconstruction_error(
                    samples_y,
                    reconstruction,
                    has_multiple_features=has_multiple_features)

                # the error vector can be shorter than the frame (e.g. after
                # windowing); cut_rows is then negative and iloc trims the tail
                if len(data_frame.index) != len(reconstruction_error):
                    cut_rows = len(reconstruction_error) - len(
                        data_frame.index)
                    data_frame = data_frame.iloc[:cut_rows].copy()

                data_frame['reconstruction_error'] = reconstruction_error
                data_frame['reconstruction_error_rolling'] = data_frame[
                    'reconstruction_error'].rolling(
                        window=rolling_window_size).median().bfill()
                # note: the z-score columns assume 'train' was in data_frames,
                # otherwise the train median/MAD names are undefined here
                data_frame['z_score'] = list(
                    map(
                        lambda x: z_score(x, train_reconstruction_error_median,
                                          train_reconstruction_error_mad),
                        reconstruction_error))
                data_frame['z_score_rolling'] = list(
                    map(
                        lambda x: z_score(
                            x, train_reconstruction_error_rolling_median,
                            train_reconstruction_error_rolling_mad),
                        data_frame['reconstruction_error_rolling'].values))

                self.plot(f'{data_frame_log_path}/reconstruction_error',
                          x=data_frame.index,
                          y=reconstruction_error,
                          label='reconstruction error',
                          time_formatting=True,
                          close=False)
                self.plot(
                    f'{data_frame_log_path}/reconstruction_error_rolling',
                    x=data_frame.index,
                    y=data_frame['reconstruction_error_rolling'],
                    label='rolling reconstruction error',
                    time_formatting=True,
                    create_figure=False)

                self.plot(f'{data_frame_log_path}/z_score',
                          x=data_frame.index,
                          y=data_frame['z_score'],
                          label='z-score',
                          time_formatting=True,
                          close=False)
                self.plot(f'{data_frame_log_path}/z_score_rolling',
                          x=data_frame.index,
                          y=data_frame['z_score_rolling'],
                          label='rolling z-score',
                          time_formatting=True,
                          create_figure=False)

                if labels is not None and data_frame_key in labels and len(
                        set(labels[data_frame_key].values)) > 1:
                    roc = compute_roc(labels[data_frame_key].values,
                                      reconstruction_error)
                    roc_rolling = compute_roc(
                        labels[data_frame_key].values,
                        data_frame['reconstruction_error_rolling'].values)

                    self.log(f'{data_frame_log_path}/roc/auc', roc['auc'])
                    self.log(f'{data_frame_log_path}/roc/data',
                             roc,
                             to_pickle=True)
                    self.plot_roc(f'{data_frame_log_path}/roc/fpr_tpr',
                                  roc['fpr'], roc['tpr'])

                    self.log(f'{data_frame_log_path}/roc/auc_rolling',
                             roc_rolling['auc'])
                    self.log(f'{data_frame_log_path}/roc/data_rolling',
                             roc_rolling,
                             to_pickle=True)
                    self.plot_roc(f'{data_frame_log_path}/roc/fpr_tpr_rolling',
                                  roc_rolling['fpr'], roc_rolling['tpr'])

                if train_reconstruction_error is not None:
                    data_frame['health_score'] = compute_health_score(
                        train_reconstruction_error, reconstruction_error)
                    data_frame['health_score_rolling'] = data_frame[
                        'health_score'].rolling(
                            window=rolling_window_size).median().bfill()
                    log_metrics = [
                        'reconstruction_error', 'reconstruction_error_rolling',
                        'health_score', 'health_score_rolling'
                    ]

                    if labels is not None and data_frame_key in labels and len(
                            set(labels[data_frame_key].values)) > 1:
                        threshold = percentile(train_z_scores,
                                               threshold_percentile)
                        rolling_threshold = percentile(train_z_scores_rolling,
                                                       threshold_percentile)
                        prediction = (data_frame['z_score'] >
                                      threshold).astype(int)
                        prediction_rolling = (data_frame['z_score_rolling'] >
                                              rolling_threshold).astype(int)
                        metrics = compute_classification_metrics(
                            labels[data_frame_key].values, prediction)
                        metrics_rolling = compute_classification_metrics(
                            labels[data_frame_key].values, prediction_rolling)

                        self.log(
                            f'{data_frame_log_path}/classification/thresholds',
                            {
                                'threshold_percentile': threshold_percentile,
                                'threshold': threshold,
                                'rolling_threshold': rolling_threshold
                            })
                        self.log(
                            f'{data_frame_log_path}/classification/metrics',
                            metrics)
                        self.log(
                            f'{data_frame_log_path}/classification/metrics_rolling',
                            metrics_rolling)
                        log_metrics.append('z_score')
                        log_metrics.append('z_score_rolling')

                    self.plot(
                        f'{data_frame_log_path}/health_score/health_score',
                        x=data_frame.index,
                        y=data_frame['health_score'],
                        label='health score',
                        ylim=[0, 1],
                        time_formatting=True,
                        close=False)
                    self.plot(
                        f'{data_frame_log_path}/health_score/health_score_rolling',
                        x=data_frame.index,
                        y=data_frame['health_score_rolling'],
                        label='rolling health score',
                        ylim=[0, 1],
                        time_formatting=True,
                        create_figure=False)

                    self.log(f'{data_frame_log_path}/metrics',
                             data_frame[log_metrics])
                else:
                    self.log(
                        f'{data_frame_log_path}/metrics', data_frame[[
                            'reconstruction_error',
                            'reconstruction_error_rolling', 'z_score',
                            'z_score_rolling'
                        ]])

                for sample_index in log_samples:
                    # concatenate (not add) input and reconstruction when
                    # checking whether both fit into the [0, 1] range
                    combined = np.concatenate(
                        (np.ravel(samples_y[sample_index]),
                         np.ravel(reconstruction[sample_index])))
                    ylim = [0, 1] if np.all(
                        (combined >= 0.0) & (combined <= 1.0)) else None
                    sample_log_path = f'{data_frame_log_path}/samples/sample_{sample_index}'

                    self.plot(f'{sample_log_path}/input',
                              y=samples_y[sample_index],
                              ylim=ylim,
                              label='input',
                              close=False)
                    self.plot(f'{sample_log_path}/reconstruction',
                              y=reconstruction[sample_index],
                              ylim=ylim,
                              label='reconstruction',
                              create_figure=False)

                    self.log(
                        f'{sample_log_path}/data',
                        pd.DataFrame.from_dict({
                            'input':
                            samples_y[sample_index],
                            'reconstruction':
                            reconstruction[sample_index]
                        }))

                    plot_model_layer_activations(
                        model=model
                        if isinstance(model, Model) else model.model,
                        sample=samples_x[sample_index],
                        out_path=self._out_path(
                            f'{sample_log_path}/activations/',
                            is_directory=True))
            except ValueError:
                # plotting or length mismatches surface as ValueError; log
                # the traceback and continue with the next data frame
                self.log(f'{data_frame_log_path}/error',
                         traceback.format_exc())
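
The method above mixes two smoothing schemes: np.convolve with mode='valid' for the train-side rolling error, which shortens the series by rolling_window_size - 1, and a length-preserving pandas rolling median with backfill for the logged frames. A small contrast of the two on made-up data:

import numpy as np
import pandas as pd

rolling_window_size = 200
errors = np.abs(np.random.default_rng(0).standard_normal(1000))

# moving average, as used for train_reconstruction_error_rolling;
# mode='valid' drops the first rolling_window_size - 1 positions
rolling_mean = np.convolve(errors,
                           np.ones(rolling_window_size) / rolling_window_size,
                           mode='valid')

# rolling median, as used for the logged data frames; backfilling the
# leading NaNs keeps the original length
rolling_median = pd.Series(errors).rolling(
    window=rolling_window_size).median().bfill()

print(len(errors), len(rolling_mean), len(rolling_median))  # 1000 801 1000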