def test_lambda_values(X_n140_outliers) -> None:
    """
    Verify that the mean outlier probability shrinks as the extent (lambda)
    parameter grows from 1 to 3; larger extent values are expected to
    produce more constrained scores.
    :param X_n140_outliers: A pytest Fixture that generates 140 observations.
    :return: None
    """
    # Fit one model per extent value and keep the mean of its scores
    # (the lower the score, the more normal the observation)
    mean_by_extent = {}
    for extent in (1, 2, 3):
        model = loop.LocalOutlierProbability(
            X_n140_outliers, extent=extent, use_numba=NUMBA)
        mean_by_extent[extent] = np.mean(
            model.fit().local_outlier_probabilities)

    # The mean score must strictly decrease with every larger extent
    assert_greater(mean_by_extent[1], mean_by_extent[2])
    assert_greater(mean_by_extent[2], mean_by_extent[3])
def test_stream_performance(X_n140_outliers) -> None:
    """
    Check that scoring the last 40 observations through the streaming
    interface (after fitting on the first 100 only) stays within an
    expected level of error of the classical approach, which fits all
    140 observations at once. No distance or neighbor matrices are used.
    :param X_n140_outliers: A pytest Fixture that generates 140 observations.
    :return: None
    """
    history = X_n140_outliers[0:100]
    incoming = X_n140_outliers[100:140]

    # Classical approach: a single fit over the complete data set
    full_model = loop.LocalOutlierProbability(
        X_n140_outliers, use_numba=NUMBA).fit()
    batch_scores = full_model.local_outlier_probabilities

    # Streaming approach: fit on the history, then score point by point
    stream_model = loop.LocalOutlierProbability(history, use_numba=NUMBA)
    stream_model.fit()
    history_scores = stream_model.local_outlier_probabilities
    incoming_scores = np.array([
        stream_model.stream(incoming[row])
        for row in range(incoming.shape[0])
    ])
    combined_scores = np.hstack((history_scores, incoming_scores))

    # The root mean squared error between both approaches must be < 0.35
    squared_error = (batch_scores - combined_scores) ** 2
    rmse = np.sqrt(squared_error.mean(axis=None))
    assert_greater(0.35, rmse)
def test_loop(X_n8) -> None:
    """
    Exercise the basic functionality and assert that the two anomalous
    observations (the last two rows) are detected as anomalies. The same
    checks run on the input as given and on a Pandas DataFrame built
    from it.
    :param X_n8: A pytest Fixture that generates the 8 observations.
    :return: None
    """
    def check_detection(data) -> None:
        # Fit LocalOutlierProbability and label every score above the
        # share of outliers (2 out of 8) as an anomaly (-1)
        model = loop.LocalOutlierProbability(
            data, n_neighbors=5, use_numba=NUMBA)
        scores = model.fit().local_outlier_probabilities
        outlier_share = 2. / 8.
        labels = [-1 if value > outlier_share else 1 for value in scores]
        assert_array_equal(labels, 6 * [1] + 2 * [-1])

        # Smallest outlier score must exceed the largest inlier score
        assert_greater(np.min(scores[-2:]), np.max(scores[:-2]))

    # Input as provided by the fixture
    check_detection(X_n8)

    # Same data wrapped in a DataFrame
    check_detection(pd.DataFrame(X_n8))
def test_lambda_values():
    """
    Verify that the mean outlier probability shrinks as the extent (lambda)
    parameter grows from 1 to 3.

    The data consists of 120 points drawn with ``rng.randn`` and scaled by
    0.3, followed by 20 points drawn uniformly from [-4, 4).
    :return: None
    """
    # Build the data set; the draw order on rng is significant
    rng = check_random_state(2)
    inliers = 0.3 * rng.randn(120, 2)
    outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[inliers, outliers]

    # One model per extent value; keep the mean of each score vector
    # (the lower the score, the more normal the observation)
    means = []
    for extent in (1, 2, 3):
        model = loop.LocalOutlierProbability(data, extent=extent)
        means.append(np.mean(model.fit().local_outlier_probabilities))

    # The mean score must strictly decrease with every larger extent
    mean_extent1, mean_extent2, mean_extent3 = means
    assert_greater(mean_extent1, mean_extent2)
    assert_greater(mean_extent2, mean_extent3)
def test_stream_performance():
    """
    Check that streaming the last 40 observations through a model fit on
    the first 100 gives scores close (RMSE below 0.35) to those of a model
    fit on all 140 observations at once.

    The data consists of 120 points drawn with ``rng.randn`` and scaled by
    0.3, followed by 20 points drawn uniformly from [-4, 4).
    :return: None
    """
    # Build the data set; the draw order on rng is significant
    rng = check_random_state(2)
    inliers = 0.3 * rng.randn(120, 2)
    outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[inliers, outliers]
    history, incoming = data[0:100], data[100:140]

    # Classical approach: a single fit over the complete data set
    full_model = loop.LocalOutlierProbability(data).fit()
    batch_scores = full_model.local_outlier_probabilities

    # Streaming approach: fit on the history, then score point by point
    stream_model = loop.LocalOutlierProbability(history)
    stream_model.fit()
    history_scores = stream_model.local_outlier_probabilities
    incoming_scores = np.array(
        [stream_model.stream(point) for point in incoming])
    combined_scores = np.hstack((history_scores, incoming_scores))

    # The root mean squared error between both approaches must be < 0.35
    squared_error = (batch_scores - combined_scores) ** 2
    rmse = np.sqrt(squared_error.mean(axis=None))
    assert_greater(0.35, rmse)
def test_stream_distance(X_n140_outliers) -> None:
    """
    Compare streamed scores from a model fit on the observations with
    streamed scores from a model fit on a precomputed distance / neighbor
    matrix, and require the RMSE between the two to be below 0.075.
    :param X_n140_outliers: A pytest Fixture that generates 140 observations.
    :return: None
    """
    history = X_n140_outliers[0:100]
    incoming = X_n140_outliers[100:140]
    n_incoming = incoming.shape[0]

    # Distances and neighbor indices of the history (10 neighbors each)
    neigh = NearestNeighbors(metric='euclidean')
    neigh.fit(history)
    distances, indices = neigh.kneighbors(
        history, n_neighbors=10, return_distance=True)

    # One model on the observations, one on the precomputed matrices
    data_model = loop.LocalOutlierProbability(history).fit()
    matrix_model = loop.LocalOutlierProbability(
        distance_matrix=distances, neighbor_matrix=indices).fit()

    # Stream every new observation through the data-based model
    data_scores = np.array([
        data_model.stream(np.array(incoming[row]))
        for row in range(n_incoming)
    ])

    # The matrix-based model is streamed the mean distance between each
    # new observation and the neighbors returned by the fitted estimator
    matrix_scores = []
    for row in range(n_incoming):
        query = np.array([incoming[row]])
        query_distances, _ = neigh.kneighbors(query, return_distance=True)
        matrix_scores.append(matrix_model.stream(np.mean(query_distances)))
    matrix_scores = np.array(matrix_scores)

    # The root mean squared error between both variants must be < 0.075
    squared_error = (data_scores - matrix_scores) ** 2
    rmse = np.sqrt(squared_error.mean(axis=None))
    assert_greater(0.075, rmse)
def test_n_neighbors():
    """
    Request 500 neighbors on the iris data and check that, after fitting,
    ``n_neighbors`` equals the number of observations minus one and that
    ``fit`` issues a UserWarning.
    :return: None
    """
    observations = iris.data
    largest_allowed = observations.shape[0] - 1

    # Fitting directly: the neighbor count is reduced
    fitted = loop.LocalOutlierProbability(
        observations, n_neighbors=500).fit()
    assert_equal(fitted.n_neighbors, largest_allowed)

    # Fitting through assert_warns: a UserWarning accompanies the reduction
    model = loop.LocalOutlierProbability(observations, n_neighbors=500)
    assert_warns(UserWarning, model.fit)
    assert_equal(model.n_neighbors, largest_allowed)
def test_n_neighbors() -> None:
    """
    Request a number of neighbors (500) greater than the number of
    observations in the iris data. The software is expected to fall back
    to the number of observations minus one and to issue a UserWarning.
    :return: None
    """
    observations = iris.data
    largest_allowed = observations.shape[0] - 1

    # Fitting directly: the neighbor count is reduced
    fitted = loop.LocalOutlierProbability(
        observations, n_neighbors=500, use_numba=NUMBA).fit()
    assert_equal(fitted.n_neighbors, largest_allowed)

    # Fitting through assert_warns: a UserWarning accompanies the reduction
    model = loop.LocalOutlierProbability(
        observations, n_neighbors=500, use_numba=NUMBA)
    assert_warns(UserWarning, model.fit)
    assert_equal(model.n_neighbors, largest_allowed)
def test_loop_dist_matrix(X_n120) -> None:
    """
    Check that fitting on the observations and fitting on a precomputed
    distance / neighbor matrix (10 euclidean neighbors) give scores that
    agree to one decimal place.
    :param X_n120: A pytest Fixture that generates 120 observations.
    :return: None
    """
    # Precompute distances and neighbor indices
    neigh = NearestNeighbors(metric='euclidean')
    neigh.fit(X_n120)
    distances, indices = neigh.kneighbors(
        X_n120, n_neighbors=10, return_distance=True)

    # Build both models before fitting either of them
    data_model = loop.LocalOutlierProbability(X_n120)
    matrix_model = loop.LocalOutlierProbability(
        distance_matrix=distances, neighbor_matrix=indices)
    data_scores = data_model.fit().local_outlier_probabilities
    matrix_scores = matrix_model.fit().local_outlier_probabilities

    # Both variants must agree to one decimal place
    assert_almost_equal(data_scores, matrix_scores, decimal=1)
def test_small_cluster_size(X_n140_outliers) -> None:
    """
    Check that fitting exits (SystemExit) with a single UserWarning when
    the requested number of neighbors is larger than the smallest cluster
    in the input data.
    :param X_n140_outliers: A pytest Fixture that generates 140 observations.
    :return: None
    """
    # 120 labels for the first cluster followed by 18 for the second
    cluster_labels = [0] * 120 + [1] * 18
    expected_message = (
        "Number of neighbors specified larger than smallest "
        "cluster. Specify a number of neighbors smaller than "
        "the smallest cluster size (observations in smallest "
        "cluster minus one.")[:-1] + ")."

    model = loop.LocalOutlierProbability(
        X_n140_outliers,
        n_neighbors=50,
        cluster_labels=cluster_labels,
        use_numba=NUMBA)

    # fit() is expected to warn and then exit
    with pytest.raises(SystemExit) as exit_record, \
            pytest.warns(UserWarning) as warning_record:
        model.fit()

    assert exit_record.type == SystemExit

    # Exactly one warning, carrying the expected text
    assert len(warning_record) == 1
    assert warning_record[0].message.args[0] == expected_message
def test_loop_performance(X_n120) -> None:
    """
    Using a set of known anomalies (labels), measure the performance of
    the software with the ROC / AUC score and ensure it captures most of
    the anomalies in this basic scenario.
    :param X_n120: A pytest Fixture that generates the 120 observations.
    :return: None
    """
    # Abnormal novel observations; `rng` is not defined in this function
    # and is expected to exist at module level
    outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[X_n120, outliers]

    # 1 marks a fixture observation, -1 marks a generated outlier
    truth = np.r_[np.repeat(1, X_n120.shape[0]),
                  np.repeat(-1, outliers.shape[0])]

    # Use every other observation as a neighbor; progress_bar=True also
    # exercises the progress bar
    model = loop.LocalOutlierProbability(
        data,
        n_neighbors=data.shape[0] - 1,
        progress_bar=True,
        use_numba=NUMBA)

    # Scores above the share of outliers are predicted as anomalies
    # (the lower the score, the more normal the observation)
    scores = model.fit().local_outlier_probabilities
    outlier_share = outliers.shape[0] / data.shape[0]
    predicted = [-1 if value > outlier_share else 1 for value in scores]

    # NOTE(review): roc_auc_score is documented as (y_true, y_score); the
    # call below passes (predictions, labels) — confirm this is intended.
    assert_greater(roc_auc_score(predicted, truth), .98)
def test_input_too_many(X_n120) -> None:
    """
    Check that a single UserWarning with the expected text is issued when
    both a data matrix and a distance matrix are supplied (only one of the
    two may be provided).
    :param X_n120: A pytest Fixture that generates 120 observations.
    :return: None
    """
    expected_message = ("Only one of the following may be provided: "
                        "data or a distance matrix (not both).")

    # Precompute distances and neighbor indices
    neigh = NearestNeighbors(metric='euclidean')
    neigh.fit(X_n120)
    distances, indices = neigh.kneighbors(
        X_n120, n_neighbors=10, return_distance=True)

    # Supplying data together with a distance matrix must warn
    with pytest.warns(UserWarning) as caught:
        loop.LocalOutlierProbability(
            X_n120,
            distance_matrix=distances,
            neighbor_matrix=indices,
            use_numba=NUMBA)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_input_neighbor_mismatch():
    """
    Construct a model from a 5-neighbor distance matrix alone and check
    that exactly one UserWarning with the neighbor-mismatch text is
    recorded.
    :return: None
    """
    expected_message = ("The shape of the distance or "
                        "neighbor index matrix does not "
                        "match the number of neighbors "
                        "specified.")

    # Generate the data and its 5-neighbor distance matrix
    rng = check_random_state(2)
    data = 0.3 * rng.randn(120, 2)
    neigh = NearestNeighbors(n_neighbors=5, metric='euclidean')
    neigh.fit(data)
    distances, _ = neigh.kneighbors(data, return_distance=True)

    with pytest.warns(UserWarning) as caught:
        # NOTE(review): the warning recorded here is raised by the test
        # itself, so the assertions below do not depend on the library
        # emitting it — confirm this is intended.
        warnings.warn(expected_message, UserWarning)
        # Only a distance matrix is supplied, no neighbor matrix
        loop.LocalOutlierProbability(distance_matrix=distances)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_input_shape_mismatch():
    """
    Construct a model from a distance matrix and a separately obtained
    neighbor matrix and check that exactly one UserWarning with the
    shape-mismatch text is recorded.
    :return: None
    """
    expected_message = ("The shape of the distance and neighbor "
                        "index matrices must match.")

    # Generate the data
    rng = check_random_state(2)
    data = 0.3 * rng.randn(120, 2)

    # Distances and neighbor indices from a 10-neighbor estimator
    neigh = NearestNeighbors(n_neighbors=10, metric='euclidean')
    neigh.fit(data)
    distances, _ = neigh.kneighbors(data, return_distance=True)

    # A second estimator configured with 5 neighbors
    neigh_2 = NearestNeighbors(n_neighbors=5, metric='euclidean')
    neigh_2.fit(data)
    # NOTE(review): this queries `neigh` (10 neighbors), not `neigh_2`, so
    # both matrices have the same shape — presumably `neigh_2` was meant.
    _, other_indices = neigh.kneighbors(data, return_distance=True)

    with pytest.warns(UserWarning) as caught:
        # NOTE(review): the warning recorded here is raised by the test
        # itself, so the assertions below do not depend on the library
        # emitting it — confirm this is intended.
        warnings.warn(expected_message, UserWarning)
        loop.LocalOutlierProbability(
            distance_matrix=distances, neighbor_matrix=other_indices)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_distance_neighbor_shape_mismatch(X_n120) -> None:
    """
    Test to ensure that the proper warning is issued if there is a mismatch
    between the shape of the provided distance and neighbor matrices.
    :param X_n120: A pytest Fixture that generates 120 observations.
    :return: None
    """
    # generate distance and neighbor indices (10 neighbors)
    neigh = NearestNeighbors(metric='euclidean')
    neigh.fit(X_n120)
    d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)

    # generate distance and neighbor indices of a different shape
    # (5 neighbors); the second estimator is the one queried here, so it
    # is no longer fitted without being used
    neigh_2 = NearestNeighbors(metric='euclidean')
    neigh_2.fit(X_n120)
    d_2, idx_2 = neigh_2.kneighbors(X_n120, n_neighbors=5,
                                    return_distance=True)

    with pytest.warns(UserWarning) as record:
        # attempt to fit loop with a mismatch in shapes: a 10-column
        # distance matrix paired with a 5-column neighbor matrix
        loop.LocalOutlierProbability(
            distance_matrix=d,
            neighbor_matrix=idx_2,
            n_neighbors=5,
            use_numba=NUMBA)

    # check that only one warning was raised
    assert len(record) == 1

    # check that the message matches
    assert record[0].message.args[
        0] == "The shape of the distance and neighbor " \
              "index matrices must match."
def test_input_neighbor_mismatch(X_n120) -> None:
    """
    Check that a single UserWarning with the expected text is issued when
    the supplied distance (and neighbor) matrix does not match the
    specified number of neighbors.
    :param X_n120: A pytest Fixture that generates 120 observations.
    :return: None
    """
    expected_message = ("The shape of the distance or "
                        "neighbor index matrix does not "
                        "match the number of neighbors "
                        "specified.")

    # Matrices computed for 5 neighbors
    neigh = NearestNeighbors(metric='euclidean')
    neigh.fit(X_n120)
    distances, indices = neigh.kneighbors(
        X_n120, n_neighbors=5, return_distance=True)

    # The model is asked for 10 neighbors, which must warn
    with pytest.warns(UserWarning) as caught:
        loop.LocalOutlierProbability(
            distance_matrix=distances,
            neighbor_matrix=indices,
            n_neighbors=10,
            use_numba=NUMBA)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_stream_cluster(X_n140_outliers) -> None:
    """
    Check that a single UserWarning with the expected text is issued when
    the streaming approach is called on a model fit with cluster labels,
    as the streaming approach does not support clustered data.
    :param X_n140_outliers: A pytest Fixture that generates 140 observations.
    :return: None
    """
    expected_message = (
        "Stream approach does not support clustered data. "
        "Automatically refit using single cluster of points.")

    # 120 labels for the first cluster followed by 18 for the second
    cluster_labels = [0] * 120 + [1] * 18

    # Fit on the first 138 observations; the observation at index 139 is
    # streamed afterwards
    history = X_n140_outliers[0:138]
    new_point = X_n140_outliers[139]
    model = loop.LocalOutlierProbability(
        history, cluster_labels=cluster_labels, use_numba=NUMBA).fit()

    with pytest.warns(UserWarning) as caught:
        model.stream(new_point)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_stream_fit():
    """
    Call ``stream`` on a model that has not been fit and check that
    exactly one UserWarning with the must-fit-first text is recorded.
    :return: None
    """
    expected_message = (
        "Must fit on historical data by calling fit() prior to "
        "calling stream(x).")

    # Build the data set; the draw order on rng is significant
    rng = check_random_state(2)
    inliers = 0.3 * rng.randn(120, 2)
    outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[inliers, outliers]

    # The model is created from the first 138 rows but never fit
    history, new_point = data[0:138], data[139]
    model = loop.LocalOutlierProbability(history)

    with pytest.warns(UserWarning) as caught:
        # NOTE(review): the warning recorded here is raised by the test
        # itself, so the assertions below do not depend on the library
        # emitting it — confirm this is intended.
        warnings.warn(expected_message, UserWarning)
        model.stream(new_point)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def get_TOS_loop(X, y, k_list, feature_list):
    """
    Fit one Local Outlier Probability model per neighbor count and collect
    its scores together with the ROC AUC and precision@n metrics.

    A summary line is printed for every neighbor count, followed by one
    empty line once all counts are processed.
    :param X: Input samples; wrapped in a pandas DataFrame before fitting.
    :param y: Labels passed to ``roc_auc_score`` and ``get_precn``.
    :param k_list: Neighbor counts to evaluate, one model per entry.
    :param feature_list: List that is extended in place with one
        ``'loop_<k>'`` name per neighbor count.
    :return: Tuple ``(feature_list, roc_loop, prec_loop, result_loop)``
        where ``roc_loop`` and ``prec_loop`` hold the metrics rounded to
        4 decimals and ``result_loop`` has one score column per entry of
        ``k_list``.
    """
    # only compatible with pandas
    frame = pd.DataFrame(X)

    result_loop = np.zeros([X.shape[0], len(k_list)])
    roc_loop, prec_loop = [], []

    for column, k in enumerate(k_list):
        model = loop.LocalOutlierProbability(frame, n_neighbors=k).fit()
        scores = model.local_outlier_probabilities.astype(float)

        roc = np.round(roc_auc_score(y, scores), decimals=4)
        prec_n = np.round(get_precn(y, scores), decimals=4)
        print('LoOP @ {k} - ROC: {roc} Precision@n: {pren}'.format(
            k=k, roc=roc, pren=prec_n))

        feature_list.append('loop_' + str(k))
        roc_loop.append(roc)
        prec_loop.append(prec_n)
        result_loop[:, column] = scores

    print()
    return feature_list, roc_loop, prec_loop, result_loop
def test_small_cluster_size():
    """
    Construct a model whose neighbor count (50) is larger than the
    smallest cluster (18 labels) and check that exactly one UserWarning
    with the small-cluster text is recorded.
    :return: None
    """
    expected_message = "Number of neighbors specified larger than smallest cluster. Specify a number of neighbors smaller than the smallest cluster size (observations in smallest cluster minus one)."

    # Build the data set; the draw order on rng is significant
    rng = check_random_state(2)
    inliers = 0.3 * rng.randn(120, 2)
    outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[inliers, outliers]

    # 120 labels for the first cluster followed by 18 for the second
    cluster_labels = [0] * 120 + [1] * 18

    with pytest.warns(UserWarning) as caught:
        # NOTE(review): the warning recorded here is raised by the test
        # itself, so the assertions below do not depend on the library
        # emitting it — confirm this is intended.
        warnings.warn(expected_message, UserWarning)
        loop.LocalOutlierProbability(
            data, n_neighbors=50, cluster_labels=cluster_labels)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_small_cluster_size(X_n140_outliers) -> None:
    """
    Check that fitting exits (SystemExit) with a single UserWarning when
    the requested number of neighbors (50) is larger than the smallest
    cluster (18 labels).
    :param X_n140_outliers: A pytest Fixture; its contents are not
        visible here.
    :return: None
    """
    # 120 labels for the first cluster followed by 18 for the second
    cluster_labels = [0] * 120 + [1] * 18
    expected_message = "".join([
        "Number of neighbors specified larger than smallest ",
        "cluster. Specify a number of neighbors smaller than ",
        "the smallest cluster size (observations in smallest ",
        "cluster minus one).",
    ])

    model = loop.LocalOutlierProbability(
        X_n140_outliers, n_neighbors=50, cluster_labels=cluster_labels)

    # fit() is expected to warn and then exit
    with pytest.raises(SystemExit) as exit_record, \
            pytest.warns(UserWarning) as warning_record:
        model.fit()

    assert exit_record.type == SystemExit

    # Exactly one warning, carrying the expected text
    assert len(warning_record) == 1
    assert warning_record[0].message.args[0] == expected_message
def get_anomalies_by_LOop(graph, k_nn, threshold):
    """
    Return the edges of ``graph`` whose two endpoints are both anomalous
    according to their Local Outlier Probability.

    Every node is described by its row of the adjacency matrix. Nearest
    neighbors of these rows are found with the Hamming metric, and the
    resulting distance / neighbor matrices are used to fit the model.
    :param graph: A networkx graph.
    :param k_nn: Number of nearest neighbors used both for the neighbor
        search and for the Local Outlier Probability model.
    :param threshold: A node is anomalous when its score is strictly
        greater than this value.
    :return: List of ``(node, node)`` tuples. Every pair of anomalous
        nodes that are neighbors in ``graph`` appears exactly once, with
        the node that comes first in ``graph.nodes`` listed first.
    """
    # to_numpy_array replaces to_numpy_matrix, which NetworkX 3.0 removed;
    # the adjacency values handed to the neighbor search are the same
    adjacency = nx.to_numpy_array(graph)

    neigh = NearestNeighbors(n_neighbors=k_nn, metric='hamming')
    neigh.fit(adjacency)
    d, idx = neigh.kneighbors(adjacency, return_distance=True)

    m = loop.LocalOutlierProbability(
        distance_matrix=d, neighbor_matrix=idx, n_neighbors=k_nn).fit()
    scores = m.local_outlier_probabilities

    # Anomalous nodes, kept in graph.nodes iteration order
    anomalies = [node for i, node in enumerate(graph.nodes)
                 if scores[i] > threshold]

    # nx.all_neighbors is symmetric (predecessors and successors for
    # directed graphs), so checking every pair once, earlier node first,
    # yields the same edges in the same order as scanning all ordered
    # pairs and skipping those whose reverse is already recorded.
    anomalies_edges = []
    for position, first in enumerate(anomalies):
        adjacent = set(nx.all_neighbors(graph, first))
        for second in anomalies[position + 1:]:
            if second in adjacent:
                anomalies_edges.append((first, second))
    return anomalies_edges
def test_stream_cluster():
    """
    Stream one observation through a model that was fit with cluster
    labels and check that exactly one UserWarning with the
    clustered-data text is recorded.
    :return: None
    """
    expected_message = "Stream approach does not support clustered data. Automatically refit using single cluster of points."

    # Build the data set; the draw order on rng is significant
    rng = check_random_state(2)
    inliers = 0.3 * rng.randn(120, 2)
    outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[inliers, outliers]

    # 120 labels for the first cluster followed by 18 for the second
    cluster_labels = [0] * 120 + [1] * 18

    # Fit on the first 138 rows; the row at index 139 is streamed
    history, new_point = data[0:138], data[139]
    model = loop.LocalOutlierProbability(
        history, cluster_labels=cluster_labels).fit()

    with pytest.warns(UserWarning) as caught:
        # NOTE(review): the warning recorded here is raised by the test
        # itself, so the assertions below do not depend on the library
        # emitting it — confirm this is intended.
        warnings.warn(expected_message, UserWarning)
        model.stream(new_point)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def loOP(S, n_neighbours):
    """
    Fit a Local Outlier Probability model with extent 0.95 on ``S``, print
    every score on its own line and return the scores.
    :param S: Input data; converted with ``np.array`` before fitting.
    :param n_neighbours: Number of neighbors passed to the model.
    :return: The ``local_outlier_probabilities`` of the fitted model.
    """
    data = np.array(S)
    fitted = loop.LocalOutlierProbability(
        data, extent=0.95, n_neighbors=n_neighbours).fit()
    scores = fitted.local_outlier_probabilities

    # Echo the scores one per line
    for value in scores:
        print(value)
    return scores
def test_missing_values():
    """
    Check that fitting a one-dimensional input containing a NaN value
    raises SystemExit.
    :return: None
    """
    data_with_nan = np.array([1.3, 1.1, 0.9, 1.4, 1.5, np.nan, 3.2])
    model = loop.LocalOutlierProbability(data_with_nan, n_neighbors=3)

    with pytest.raises(SystemExit) as exit_record:
        model.fit()

    assert exit_record.type == SystemExit
def test_loop_dist_matrix():
    """
    Check that fitting on the observations and fitting on a precomputed
    distance / neighbor matrix (10 euclidean neighbors) give scores that
    agree to one decimal place.
    :return: None
    """
    # Generate the data
    rng = check_random_state(2)
    data = 0.3 * rng.randn(120, 2)

    # Precompute distances and neighbor indices
    neigh = NearestNeighbors(n_neighbors=10, metric='euclidean')
    neigh.fit(data)
    distances, indices = neigh.kneighbors(data, return_distance=True)

    # Build both models before fitting either of them
    data_model = loop.LocalOutlierProbability(data)
    matrix_model = loop.LocalOutlierProbability(
        distance_matrix=distances, neighbor_matrix=indices)
    data_scores = data_model.fit().local_outlier_probabilities
    matrix_scores = matrix_model.fit().local_outlier_probabilities

    # Both variants must agree to one decimal place
    assert_almost_equal(data_scores, matrix_scores, decimal=1)
def loOP(train, extent = 2, n = 20):
    """
    Fit a Local Outlier Probability model on ``train`` and return its
    scores as a single column.
    :param train: Training data; must expose ``shape``.
    :param extent: Extent value passed to the model.
    :param n: Number of neighbors passed to the model.
    :return: The ``local_outlier_probabilities`` of the fitted model,
        reshaped to ``(train.shape[0], 1)``.
    """
    from PyNomaly import loop

    fitted = loop.LocalOutlierProbability(
        train, extent=extent, n_neighbors=n).fit()
    probabilities = fitted.local_outlier_probabilities

    # One row per training observation, one column
    return probabilities.reshape(train.shape[0], 1)
def test_data_format() -> None:
    """
    Check that a UserWarning is issued during ``fit`` when the input is a
    flat list of values rather than explicitly shaped data (the software
    corrects the shape when possible).
    :return: None
    """
    flat_values = [1.3, 1.1, 0.9, 1.4, 1.5, 3.2]
    model = loop.LocalOutlierProbability(
        flat_values, n_neighbors=3, use_numba=NUMBA)

    # fit is passed uncalled so that assert_warns invokes it
    assert_warns(UserWarning, model.fit)
def test_lambda_values(X_n140_outliers) -> None:
    """
    Verify that the mean outlier probability shrinks as the extent (lambda)
    parameter grows from 1 to 3.
    :param X_n140_outliers: A pytest Fixture; its contents are not
        visible here.
    :return: None
    """
    def mean_score(extent):
        # Mean of the scores for one extent value
        # (the lower the score, the more normal the observation)
        model = loop.LocalOutlierProbability(X_n140_outliers, extent=extent)
        return np.mean(model.fit().local_outlier_probabilities)

    mean_extent1 = mean_score(1)
    mean_extent2 = mean_score(2)
    mean_extent3 = mean_score(3)

    # The mean score must strictly decrease with every larger extent
    assert_greater(mean_extent1, mean_extent2)
    assert_greater(mean_extent2, mean_extent3)
def test_input_nodata(X_n140_outliers) -> None:
    """
    Check that a single UserWarning with the expected text is issued when
    a model is constructed without data and without a distance matrix.
    :param X_n140_outliers: A pytest Fixture; only its number of rows is
        used, to derive the neighbor count.
    :return: None
    """
    expected_message = "Data or a distance matrix must be provided."
    neighbor_count = X_n140_outliers.shape[0] - 1

    # Neither data nor a distance matrix is supplied
    with pytest.warns(UserWarning) as caught:
        loop.LocalOutlierProbability(n_neighbors=neighbor_count)

    # Exactly one warning, carrying the expected text
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message