def __init__(self, n_clusters=2, n_repeat=10, *anomaly_detector_params0, **anomaly_detector_params1):
    """Initialize a clustering-based anomaly detector.

    :param n_clusters: number of clusters to use when fitting
    :param n_repeat: number of repeated clustering runs
    :param anomaly_detector_params0: positional arguments forwarded to AnomalyDetector
    :param anomaly_detector_params1: keyword arguments forwarded to AnomalyDetector
    """
    # No classifier until fit is called.
    self.clf_ = None
    self.n_clusters = n_clusters
    self.n_repeat = n_repeat
    # Remember the forwarded constructor arguments so that fresh
    # AnomalyDetector instances can be built later on.
    self.ad_parms0 = anomaly_detector_params0
    self.ad_parms1 = anomaly_detector_params1
    AnomalyDetector.__init__(self, *anomaly_detector_params0, **anomaly_detector_params1)
def fit_anomaly_detector(self, X, max_k=2, n_repeat=10, scores=None, use_k=None, init_clusters=None, verbose=False, scoring_method=mean): ''' The method uses a variation of the elbow curve to select the number of clusters based on the maximum individual anomaly score for each cluster computed by the make_scores method. The method is proposed at http://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve :param X: data set to fit, 2 dim numpy array or a DataObject with a class column :param n_repeat: repeat clustering n number of times using the mean as elbow curve :param scores: an array of arrays with anomaly scores, to use for fitting instead of calling make_scores. Each array corresponds contains 1 or more calls to make_scores for a k number of clusters. For each index i in range(len(scores)) means k=i+1. :param use_k: if set, no autmatic selection is made, instead the value of use_k is used as number of clusters :param init_clusters: the clustering is initialized with the provided clusters. scores, use_k and n_repeat are ignored. :param verbose: print progress info :param scoring_method: the method to compute the aggregated evaluation score from teh anomaly scores of the data clustering, e.g. numpy mean or std. 
:return: the anomaly detector with parameters provided in the constructor fitted to the data with the best number of clusters tested ''' best_k = None if init_clusters is None: if use_k is None: ss = [] if scores is None: for n in xrange(n_repeat): self.make_scores(X, max_k,verbose=verbose) ss.append(map(lambda s: scoring_method(s[s<inf]), self.scores)) else: ss = [map(lambda s: scoring_method(s[s<inf]), scores[i]) for i in range(len(scores))] best_k, y = self.compute_best_elbow_k(ss) else: best_k = use_k ad_list = [] scores = [] for i in range(n_repeat): ad = AnomalyDetector(*self.ad_parms0, **self.ad_parms1) self.clustering_ = self._train_clf(ad, X, best_k,init_clusters,verbose=verbose) ad_scores = ad.anomaly_score(X, self.clustering_) scores.append(ad_scores[ad_scores < inf].std()) ad_list.append(ad) if init_clusters is not None: break best_ad = ad_list[argmin(scores)] if verbose: print "best k", best_k if use_k or init_clusters is not None or y is not None: self.cluster_curve_ = None else: self.cluster_curve_ = y return ad
def fit_anomaly_detector(self, data_object, poisson_onesided=True):
    """Build and fit a Poisson-based anomaly detector, one model per event column.

    :param data_object: the data to fit
    :param poisson_onesided: use one-sided Poisson models when True
    :return: the fitted AnomalyDetector
    """
    # Pick the per-column model type once, then build one model per event column.
    model_cls = P_PoissonOnesided if poisson_onesided else P_Poisson
    column_models = [model_cls(self.root_column + i, self.period_column)
                     for i in xrange(self.num_of_event_columns)]
    anomaly_detector = AnomalyDetector(column_models)
    self._anomaly_detector = anomaly_detector.fit(data_object)
    return anomaly_detector
def fit_anomaly_detector(self, data_object, poisson_onesided=True):
    """Fit an AnomalyDetector with one Poisson model per event column.

    :param data_object: the data to fit
    :param poisson_onesided: use one-sided Poisson models when True
    :return: the fitted AnomalyDetector
    """
    if poisson_onesided:
        models = [P_PoissonOnesided(self.root_column + i, self.period_column)
                  for i in xrange(self.num_of_event_columns)]
    else:
        models = [P_Poisson(self.root_column + i, self.period_column)
                  for i in xrange(self.num_of_event_columns)]
    detector = AnomalyDetector(models)
    # Keep a reference to the fitted detector on the instance as well.
    self._anomaly_detector = detector.fit(data_object)
    return detector
def make_scores(self, X, max_k, start_k=2, verbose=False):
    '''
    Returns an array of the individual anomaly scores for each example for number of each clusters.
    :param X: array of arrays or DataObject
    :param max_k: maximum number of clusters
    :param start_k: start clustering start_k to max_k (inclusive) number of clusters.
    :return: array of array of anomaly scores for each k from 1 to max_k (inclusive)
    '''
    # k=1 baseline: fit a single detector on the whole data set.
    ad = AnomalyDetector(*self.ad_parms0, **self.ad_parms1)
    ad.fit(X)
    score = ad.anomaly_score(X)
    scores = [list(score)]
    if verbose:
        print "Clusters", 1, "Score", score[score < inf].std(), sum(score == inf)
    # Examples with mid-range baseline scores (20th-80th percentile) are
    # flagged below so _train_clf can keep them in a single cluster.
    min_percentile = percentile(score, 20)
    max_percentile = percentile(score, 80)
    for k in range(start_k, max_k + 1):
        clusters = self._train_clf(
            ad, X, k, verbose=verbose,
            marked_as_single_cluster=[s >= min_percentile and s <= max_percentile for s in score])
        # Re-score against the clustering for this k.
        score = ad.anomaly_score(X, clusters)
        scores += [list(score)]
        if verbose:
            print "Clusters", k, "Score", score[score < inf].std(), sum(score == inf)
    # Cache the per-k score arrays for later elbow-curve selection.
    self.scores = array(scores)
    return self.scores
def test_conditional_gaussian_dependency_matrix(self):
    """Check that the dependency-matrix model agrees with chained conditional
    Gaussians, and that all model variants classify Markov-chain data similarly.
    NOTE(review): results depend on the global RNG state; statement order matters.
    """
    length = 100
    n_samples = 1000
    X = array([sample_markov_chain(length) for _ in range(n_samples)])
    # Next two should be equal
    s0 = AnomalyDetector(
        P_ConditionalGaussianDependencyMatrix(
            range(length), length)).fit(X).anomaly_score(X)
    ad1 = AnomalyDetector(
        P_ConditionalGaussianCombiner([
            P_ConditionalGaussian([i + 1], [i]) for i in range(length - 1)
        ] + [P_ConditionalGaussian([0], [])]), cr_plus).fit(X)
    s1 = ad1.anomaly_score(X)
    assert_allclose(s0, s1, rtol=0.0001)  # OK
    # Most likely, these two are not equal but highly correlated
    ad2 = AnomalyDetector(
        [P_ConditionalGaussian([i], []) for i in range(length)],
        cr_plus).fit(X)
    s2 = ad2.anomaly_score(X)
    ad3 = AnomalyDetector(
        P_ConditionalGaussianCombiner(
            [P_ConditionalGaussian([i], []) for i in range(length)]),
        cr_plus).fit(X)
    s3 = ad3.anomaly_score(X)
    assert_equal(pearsonr(s2, s3) > 0.985, True)
    # Test classification
    Y = array([sample_markov_chain(length, 0.2) for _ in range(n_samples)])
    Z = array([sample_markov_chain(length, 0.3) for _ in range(n_samples)])
    data = r_[X, Y, Z]
    labels = r_[['X'] * len(X), ['Y'] * len(Y), ['Z'] * len(Z)]
    # Shuffle indices, then split 2/3 train, 1/3 test.
    data_index = shuffle(range(len(data)))
    training_set = data_index[:n_samples * 2]
    test_set = data_index[n_samples * 2:]
    models = {
        'independent gaussian':
        AnomalyDetector([P_Gaussian([i]) for i in range(length)], cr_plus),
        'independent conditional gaussian':
        AnomalyDetector(
            [P_ConditionalGaussian([i], []) for i in range(length)],
            cr_plus),
        'independent conditional gaussian with combiner':
        AnomalyDetector(
            P_ConditionalGaussianCombiner(
                [P_ConditionalGaussian([i], []) for i in range(length)])),
        'single conditional gaussian with combiner':
        AnomalyDetector(
            P_ConditionalGaussianCombiner([
                P_ConditionalGaussian([i], [i - 1]) for i in range(1, length)
            ] + [P_ConditionalGaussian([0], [])])),
        'dependency matrix':
        AnomalyDetector(
            P_ConditionalGaussianDependencyMatrix(range(length), length))
    }
    all_acc = {}
    for key in models:
        # Fit each model variant and measure its labelling accuracy.
        ad = models[key].fit(data[training_set], labels[training_set])
        adclf = SklearnClassifier.clf(ad)
        labels_predicted = adclf.predict(data[test_set])
        accuracy = sum(labels[test_set] == labels_predicted) / float(
            len(test_set))
        all_acc[key] = accuracy
        print key, "accuracy = ", accuracy
    # Equivalent formulations should reach (almost) the same accuracy.
    assert_close(all_acc['independent gaussian'],
                 all_acc['independent conditional gaussian'],
                 decimal=2)
    assert_close(all_acc['independent gaussian'],
                 all_acc['independent conditional gaussian with combiner'],
                 decimal=2)
    assert_close(all_acc['single conditional gaussian with combiner'],
                 all_acc['dependency matrix'],
                 decimal=2)
def test_conditional_gaussian(self):
    """Compare plain Gaussian models against (combined) conditional Gaussians
    on independent and correlated synthetic data.
    NOTE(review): values depend on the global RNG draw order; do not reorder.
    """
    # 1-D: unconditional and conditional-on-nothing should match closely.
    x = array([[x0] for x0 in norm(0, 1).rvs(1000)])
    gauss_scores = AnomalyDetector(P_Gaussian(0)).fit(x).anomaly_score(x)
    condgauss_scores = \
        AnomalyDetector(P_ConditionalGaussian([0], [])). \
        fit(x). \
        anomaly_score(x)
    assert_allclose(gauss_scores, condgauss_scores, atol=0.01, rtol=0.01)
    # 2-D independent columns: conditioning on an independent column
    # should change little.
    X = array(
        [[x0, x1]
         for x0, x1 in zip(norm(0, 1).rvs(1000), norm(0, 1).rvs(1000))])
    gauss_scores_X = AnomalyDetector(P_Gaussian(
        [0])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(P_ConditionalGaussian([0],[1])). \
        fit(X). \
        anomaly_score(X)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.3)
    # 2-D correlated columns.
    X = array(
        [[x0, x0 + 0.1 * x1]
         for x0, x1 in zip(norm(0, 1).rvs(1000), norm(0, 1).rvs(1000))])
    # This is not equal at all
    gauss_scores_X = AnomalyDetector(P_Gaussian(
        [0, 1])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(P_ConditionalGaussian([0,1],[])). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X) > 0.994),
                 True)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad
    # 3-D with an extra independent column.
    X = array([[x0, x0 + 0.1 * x1, x2]
               for x0, x1, x2 in c_[norm(0, 1).rvs(1000),
                                    norm(0, 1).rvs(1000),
                                    norm(0, 1).rvs(1000)]])
    # This is not equal at all
    gauss_scores_X = AnomalyDetector(P_Gaussian(
        [0, 1])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(P_ConditionalGaussian([0, 1], [])). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X) > 0.994),
                 True)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad
    X = array([[x0, x0 + 0.1 * x1, x2]
               for x0, x1, x2 in c_[norm(0, 1).rvs(1000),
                                    norm(0, 1).rvs(1000),
                                    norm(0, 1).rvs(1000)]])
    # This is not equal at all
    gauss_scores_X = AnomalyDetector(P_Gaussian(
        [0, 1, 2])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(
            P_ConditionalGaussianCombiner([
                P_ConditionalGaussian([0], [1,2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ])). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X) > 0.98),
                 True)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=5)  # Very bad
    # This is very much equal
    gauss_scores_X = AnomalyDetector(P_ConditionalGaussian(
        [0, 1, 2], [])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(
            P_ConditionalGaussianCombiner([
                P_ConditionalGaussian([0], [1, 2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ])). \
        fit(X). \
        anomaly_score(X)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.001)
    # If we combine them using a ordinary combination rule by adding anomaly score together
    condgauss_scores_X2 = \
        AnomalyDetector(
            [
                P_ConditionalGaussian([0], [1, 2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ], cr_plus). \
        fit(X). \
        anomaly_score(X)
    assert_equal(
        (pearsonr(condgauss_scores_X, condgauss_scores_X2) > 0.99),
        True)  # Good
    assert_allclose(condgauss_scores_X2, condgauss_scores_X, atol=2)  # Bad
    #
    ad1 = AnomalyDetector([P_Gaussian([i]) for i in range(len(X[0]))],
                          cr_plus).fit(X)
    s1 = ad1.anomaly_score(X)
    ad2 = AnomalyDetector(
        [P_ConditionalGaussian([i], []) for i in range(len(X[0]))],
        cr_plus).fit(X)
    s2 = ad2.anomaly_score(X)
    print "r:", pearsonr(s1, s2)
    assert_allclose(s1, s2, rtol=0.01)  # OK
def _detector_fit(self, X, y):
    """Delegate fitting to the AnomalyDetector base implementation."""
    base_fit = AnomalyDetector.fit
    return base_fit(self, X, y)
def _create_detector(self, *ad_parms0, **ad_parms1):
    """Build a fresh AnomalyDetector from the given constructor arguments."""
    detector = AnomalyDetector(*ad_parms0, **ad_parms1)
    return detector
def loglikelihood(self, X, y=None):
    """Log-likelihood of X under the base detector.

    When no labels are given and a classifier has been fitted, the
    classifier's predictions are used as labels instead.
    """
    if y is None and self.clf_ is not None:
        labels = self.clf_.predict(X)
    else:
        labels = y
    return AnomalyDetector.loglikelihood(self, X, labels)
def anomaly_score(self, X, y=None):
    """Anomaly scores for X from the base detector.

    Labels default to the fitted classifier's predictions when y is None.
    """
    labels = y
    if labels is None and self.clf_ is not None:
        labels = self.clf_.predict(X)
    return AnomalyDetector.anomaly_score(self, X, labels)
def test_conditional_gaussian(self):
    """Python-3 variant: compare Gaussian vs. (combined) conditional Gaussian
    scores on independent and correlated synthetic data.
    NOTE(review): values depend on the global RNG draw order; do not reorder.
    """
    # 1-D: unconditional and conditional-on-nothing should match closely.
    x = array([[x0] for x0 in norm(0,1).rvs(1000)])
    gauss_scores = AnomalyDetector(P_Gaussian(0)).fit(x).anomaly_score(x)
    condgauss_scores = \
        AnomalyDetector(P_ConditionalGaussian([0], [])). \
        fit(x). \
        anomaly_score(x)
    assert_allclose(gauss_scores, condgauss_scores,atol=0.01,rtol=0.01)
    # 2-D independent columns.
    X = array([[x0, x1] for x0,x1 in zip(norm(0, 1).rvs(1000),
                                         norm(0, 1).rvs(1000)) ])
    gauss_scores_X = AnomalyDetector(P_Gaussian([0])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(P_ConditionalGaussian([0],[1])). \
        fit(X). \
        anomaly_score(X)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.3)
    # 2-D correlated columns.
    X = array([[x0, x0+0.1*x1] for x0,x1 in zip(norm(0, 1).rvs(1000),
                                                norm(0, 1).rvs(1000)) ])
    # This is not equal at all
    gauss_scores_X = AnomalyDetector(P_Gaussian([0,1])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(P_ConditionalGaussian([0,1],[])). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X) > 0.994), True)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad
    # 3-D with an extra independent column.
    X = array([[x0, x0 + 0.1 * x1, x2]
               for x0, x1, x2 in c_[norm(0, 1).rvs(1000),
                                    norm(0, 1).rvs(1000),
                                    norm(0, 1).rvs(1000)]])
    # This is not equal at all
    gauss_scores_X = AnomalyDetector(P_Gaussian([0, 1])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(P_ConditionalGaussian([0, 1], [])). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X) > 0.994), True)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=2)  # Very bad
    X = array(
        [[x0, x0 + 0.1 * x1, x2]
         for x0, x1, x2 in c_[norm(0, 1).rvs(1000),
                              norm(0, 1).rvs(1000),
                              norm(0, 1).rvs(1000)]])
    # This is not equal at all
    gauss_scores_X = AnomalyDetector(P_Gaussian([0, 1,2])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(
            P_ConditionalGaussianCombiner([
                P_ConditionalGaussian([0], [1,2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ])). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(gauss_scores_X, condgauss_scores_X) > 0.98), True)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=5)  # Very bad
    # This is very much equal
    gauss_scores_X = AnomalyDetector(P_ConditionalGaussian([0, 1, 2], [])).fit(X).anomaly_score(X)
    condgauss_scores_X = \
        AnomalyDetector(
            P_ConditionalGaussianCombiner([
                P_ConditionalGaussian([0], [1, 2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ])). \
        fit(X). \
        anomaly_score(X)
    assert_allclose(gauss_scores_X, condgauss_scores_X, atol=0.001)
    # If we combine them using a ordinary combination rule by adding anomaly score together
    condgauss_scores_X2 = \
        AnomalyDetector(
            [
                P_ConditionalGaussian([0], [1, 2]),
                P_ConditionalGaussian([1], [2]),
                P_ConditionalGaussian([2], []),
            ], cr_plus). \
        fit(X). \
        anomaly_score(X)
    assert_equal((pearsonr(condgauss_scores_X, condgauss_scores_X2) > 0.99), True)  # Good
    assert_allclose(condgauss_scores_X2, condgauss_scores_X, atol=2)  # Bad
    #
    ad1 = AnomalyDetector(
        [P_Gaussian([i]) for i in range(len(X[0]))], cr_plus
    ).fit(X)
    s1 = ad1.anomaly_score(X)
    ad2 = AnomalyDetector(
        [P_ConditionalGaussian([i], []) for i in range(len(X[0]))], cr_plus
    ).fit(X)
    s2 = ad2.anomaly_score(X)
    print("r:", pearsonr(s1,s2))
    assert_allclose(s1, s2, rtol=0.01)  # OK
def test_conditional_gaussian_dependency_matrix(self):
    """Python-3 variant: dependency-matrix model vs. chained conditional
    Gaussians on Markov-chain data, plus a classification comparison.
    NOTE(review): results depend on the global RNG state; statement order matters.
    """
    length = 100
    n_samples = 1000
    X = array([sample_markov_chain(length) for _ in range(n_samples)])
    # Next two should be equal
    s0 = AnomalyDetector(
        P_ConditionalGaussianDependencyMatrix(list(range(length)),length)
    ).fit(X).anomaly_score(X)
    ad1=AnomalyDetector(
        P_ConditionalGaussianCombiner([P_ConditionalGaussian([i + 1], [i])
                                       for i in range(length - 1)]+[P_ConditionalGaussian([0], [])]),
        cr_plus
    ).fit(X)
    s1 = ad1.anomaly_score(X)
    assert_allclose(s0, s1, rtol=0.0001)  # OK
    # Most likely, these two are not equal but highly correlated
    ad2=AnomalyDetector(
        [P_ConditionalGaussian([i], []) for i in range(length)], cr_plus
    ).fit(X)
    s2 = ad2.anomaly_score(X)
    ad3=AnomalyDetector(
        P_ConditionalGaussianCombiner([P_ConditionalGaussian([i], [])
                                       for i in range(length)]), cr_plus
    ).fit(X)
    s3 = ad3.anomaly_score(X)
    assert_equal(pearsonr(s2,s3)> 0.985, True)
    # Test classification
    Y = array([sample_markov_chain(length,0.2) for _ in range(n_samples)])
    Z = array([sample_markov_chain(length,0.3) for _ in range(n_samples)])
    data = r_[X,Y,Z]
    labels = r_[['X']*len(X), ['Y']*len(Y), ['Z']*len(Z)]
    # Shuffle indices, then split 2/3 train, 1/3 test.
    data_index = shuffle(list(range(len(data))))
    training_set = data_index[:n_samples*2]
    test_set = data_index[n_samples*2:]
    models = {
        'independent gaussian':
            AnomalyDetector([P_Gaussian([i]) for i in range(length)],cr_plus),
        'independent conditional gaussian':
            AnomalyDetector([P_ConditionalGaussian([i], []) for i in range(length)],cr_plus),
        'independent conditional gaussian with combiner':
            AnomalyDetector(P_ConditionalGaussianCombiner([P_ConditionalGaussian([i], []) for i in range(length)])),
        'single conditional gaussian with combiner':
            AnomalyDetector(P_ConditionalGaussianCombiner([P_ConditionalGaussian([i], [i-1]) for i in range(1, length)]+
                                                          [P_ConditionalGaussian([0], [])])),
        'dependency matrix':
            AnomalyDetector(P_ConditionalGaussianDependencyMatrix(list(range(length)),length))
    }
    all_acc = {}
    for key in models:
        # Fit each model variant and measure its labelling accuracy.
        ad=models[key].fit(data[training_set], labels[training_set])
        adclf = SklearnClassifier.clf(ad)
        labels_predicted = adclf.predict(data[test_set])
        accuracy = sum(labels[test_set]==labels_predicted)/float(len(test_set))
        all_acc[key] = accuracy
        print(key, "accuracy = ", accuracy)
    # Equivalent formulations should reach (almost) the same accuracy.
    assert_close(all_acc['independent gaussian'],all_acc['independent conditional gaussian'],decimal=2)
    assert_close(all_acc['independent gaussian'],
                 all_acc['independent conditional gaussian with combiner'],decimal=2)
    assert_close(all_acc['single conditional gaussian with combiner'],
                 all_acc['dependency matrix'],decimal=2)