# Example #1
 def test_get_column_order(self):
     """Column order derived from normalization info matches feature names."""
     num_samples = 10000
     frame = sample_utils.get_pos_sample_synthetic(mean=[0, 1, 2],
                                                   cov=np.eye(3),
                                                   n_points=num_samples)
     # The synthetic sample carries a class label; the ordering helper only
     # deals with feature columns, so drop it before deriving the info.
     frame = frame.drop(columns=['class_label'])
     norm_info = sample_utils.get_normalization_info(frame)
     ordered_columns = sample_utils.get_column_order(norm_info)
     assert ordered_columns == ['x001', 'x002', 'x003']
    def train_model(self, x_train: pd.DataFrame) -> None:
        """Train a new model and report the loss and accuracy.

        Args:
          x_train: dataframe with dimensions as columns.
        """
        self._normalization_info = sample_utils.get_normalization_info(x_train)
        column_order = sample_utils.get_column_order(self._normalization_info)
        normalized_x_train = sample_utils.normalize(x_train,
                                                    self._normalization_info)

        # Augment the positive sample with synthetic negatives so the model
        # can be trained as a binary classifier.
        normalized_training_sample = sample_utils.apply_negative_sample(
            positive_sample=normalized_x_train,
            sample_ratio=self._sample_ratio,
            sample_delta=self._sample_delta)

        # np.matrix is deprecated in NumPy; build plain float32 ndarrays.
        x = np.asarray(normalized_training_sample[column_order],
                       dtype=np.float32)
        y = np.asarray(normalized_training_sample['class_label'],
                       dtype=np.float32)
        # Create dataset objects from the arrays.
        dx = tf.data.Dataset.from_tensor_slices(x)
        dy = tf.data.Dataset.from_tensor_slices(y)

        logging.info('Training ns-nn with:')
        logging.info(normalized_training_sample['class_label'].value_counts())

        # Zip the two datasets together, then shuffle/repeat/batch for fit().
        train_dataset = tf.data.Dataset.zip(
            (dx,
             dy)).shuffle(_SHUFFLE_BUFFERSIZE).repeat().batch(self._batch_size)

        if self._tpu_worker:
            # Build the model under a TPU distribution strategy when a TPU
            # worker is configured; otherwise build it on the default device.
            resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                self._tpu_worker)
            tf.contrib.distribute.initialize_tpu_system(resolver)
            strategy = tf.contrib.distribute.TPUStrategy(resolver)
            with strategy.scope():
                self._model = self._get_model(x.shape[1], self._dropout,
                                              self._layer_width,
                                              self._n_hidden_layers)
        else:
            self._model = self._get_model(x.shape[1], self._dropout,
                                          self._layer_width,
                                          self._n_hidden_layers)

        self._model.fit(x=train_dataset,
                        steps_per_epoch=self._steps_per_epoch,
                        verbose=0,
                        epochs=self._epochs,
                        callbacks=[
                            tf.keras.callbacks.TensorBoard(
                                log_dir=self._log_dir,
                                histogram_freq=1,
                                write_graph=False,
                                write_images=False)
                        ])
# Example #3
    def train_model(self, x_train: pd.DataFrame) -> None:
        """Trains a OC-SVM Anomaly detector using the positive sample.

        Args:
          x_train: training sample, with numeric feature columns.
        """
        norm_info = sample_utils.get_normalization_info(x_train)
        self._normalization_info = norm_info
        ordered_columns = sample_utils.get_column_order(norm_info)
        # Restrict the sample to the known feature columns, normalize, and
        # fit the underlying one-class SVM on the result.
        training_features = sample_utils.normalize(x_train[ordered_columns],
                                                   norm_info)
        super(OneClassSVMAd, self).fit(X=training_features)
    def predict(self, sample_df: pd.DataFrame) -> pd.DataFrame:
        """Given new data, predict the probability of being positive class.

        Args:
          sample_df: dataframe with features as columns, same as train().

        Returns:
          Same DataFrame as sample_df, with a new column 'class_prob', the
          probability of the Normal class. Note: sample_df is modified in
          place and also returned.
        """
        sample_df_normalized = sample_utils.normalize(sample_df,
                                                      self._normalization_info)
        column_order = sample_utils.get_column_order(self._normalization_info)
        # np.matrix is deprecated in NumPy; use a plain float32 ndarray.
        x = np.asarray(sample_df_normalized[column_order], dtype=np.float32)
        y_hat = self._model.predict(x, verbose=1, steps=1)
        sample_df['class_prob'] = y_hat
        return sample_df
# Example #5
    def predict(self, sample_df: pd.DataFrame) -> pd.DataFrame:
        """Performs anomaly detection on a new sample.

        Args:
          sample_df: dataframe with the new datapoints.

        Returns:
          The original dataframe with a new column labeled 'class_prob': 1.0
          for normal, 0.0 for anomalous. Note: sample_df is modified in place
          and also returned.
        """
        sample_df_normalized = sample_utils.normalize(sample_df,
                                                      self._normalization_info)
        column_order = sample_utils.get_column_order(self._normalization_info)
        # np.matrix is deprecated in NumPy; use a plain float32 ndarray.
        x_test = np.asarray(sample_df_normalized[column_order],
                            dtype=np.float32)
        preds = super(OneClassSVMAd, self).predict(x_test)
        # OneClassSVM labels anomalies -1 and inliers +1; map the anomaly
        # label to 0 so class_prob reads 1.0 normal / 0.0 anomalous.
        sample_df['class_prob'] = np.where(preds == -1, 0, preds)
        return sample_df
# Example #6
    def predict(self, sample_df: pd.DataFrame) -> pd.DataFrame:
        """Performs anomaly detection on a new sample.

        Args:
          sample_df: dataframe with the new datapoints, not normalized.

        Returns:
          The original dataframe with a new column labeled 'class_prob'
          ranging from 1.0 as normal to 0.0 as anomalous. Note: sample_df is
          modified in place and also returned.
        """
        sample_df_normalized = sample_utils.normalize(sample_df,
                                                      self._normalization_info)
        column_order = sample_utils.get_column_order(self._normalization_info)
        # np.matrix is deprecated in NumPy; use a plain float32 ndarray.
        x = np.asarray(sample_df_normalized[column_order], dtype=np.float32)

        preds = super(NegativeSamplingRandomForestAd, self).predict_proba(x)
        # Keep only the predicted probability of the normal class.
        sample_df['class_prob'] = preds[:, _NORMAL_CLASS]
        return sample_df
# Example #7
    def train_model(self, x_train: pd.DataFrame) -> None:
        """Trains a NS-NN Anomaly detector using the positive sample.

        Args:
          x_train: training sample, which does not need to be normalized.
        """
        # TODO(sipple) Consolidate the normalization code into the base class.
        norm_info = sample_utils.get_normalization_info(x_train)
        self._normalization_info = norm_info
        ordered_columns = sample_utils.get_column_order(norm_info)
        positives = sample_utils.normalize(x_train[ordered_columns],
                                           norm_info)

        # Augment the positive sample with synthetic negatives so the forest
        # can be fit as a binary classifier.
        training_sample = sample_utils.apply_negative_sample(
            positive_sample=positives,
            sample_ratio=self._sample_ratio,
            sample_delta=self._sample_delta)

        super(NegativeSamplingRandomForestAd,
              self).fit(X=training_sample[ordered_columns],
                        y=training_sample[_CLASS_LABEL])