def compute_print_scores(normal_users, queue):
    """Print the scores of the newest entry of ``queue`` for four detectors.

    Two groups of scores are printed, each preceded by a label line:

    * "novelty" scores, where every model is fitted on the whole ``queue``
      (including the newest entry);
    * "degree of belonging to known class" scores, where every model is
      fitted on ``normal_users`` only.

    The detectors are a single-component full-covariance GMM, the
    ``anom_one_class`` helper (One-Class SVM), ``lsanomaly.LSAnomaly`` and a
    single-cluster K-means.  For GMM and K-means the per-item model scores
    are handed to ``get_score_last_item`` together with the matching
    curiosity factor.

    Args:
        normal_users (list): feature vectors of the known-normal users.  Must
            be a ``list`` because it is concatenated with ``[queue[-1]]``.
        queue (list): feature vectors of all users; ``queue[-1]`` is the
            entry being scored.

    Returns:
        None.  Results are written to standard output.

    Note:
        ``Ks`` is *not* a parameter: the four curiosity factors
        ``(K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s)`` are read from a
        module-level ``Ks`` that must exist when the function is called.
    """
    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks

    # ---- Models fitted on the whole queue --------------------------------
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('novelty score GMM')
    B = GMM(covariance_type='full', n_components=1)
    B.fit(queue)
    x = [B.score([i]).mean() for i in queue]
    print(get_score_last_item(x, K_GMM_n))

    print('novelty score OneClassSVM')
    x = anom_one_class(queue, [queue[-1]])
    print(x[-1])

    print('novelty score LSA')
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(queue)
    anomalymodel.fit(X)
    print(anomalymodel.predict(np.array([queue[-1]])))

    print('novelty score degree K_means')
    K = KMeans(n_clusters=1)
    K.fit(queue)
    x = [K.score([i]) for i in queue]
    print(get_score_last_item(x, K_KMeans_n))

    # ---- Models fitted on the known-normal users only --------------------
    # The newest entry is appended so that it is the last scored item.
    normal_and_new = normal_users + [queue[-1]]

    print('degree of belonging to known class GMM')
    B = GMM(covariance_type='full', n_components=1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    print(get_score_last_item(x, K_GMM_s))

    print('degree of belonging to known class OneClassSVM')
    x = anom_one_class(normal_users, [queue[-1]])
    print(x[-1])

    print('degree of belonging to known class LSA')
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    print(anomalymodel.predict(np.array([queue[-1]])))

    print('degree of belonging to known class K_means')
    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    print(get_score_last_item(x, K_KMeans_s))
def plot_results(X, xx, yy, threshold=0.5, sigma_candidates=None, rho_candidates=None):
    """Plot the LSAnomaly response over a grid for every (sigma, rho) pair.

    One subplot is drawn per combination, laid out with one row per sigma
    and one column per rho.  Each subplot shows the filled contour of the
    model's ``decision_function`` over the grid, the decision boundary at
    ``threshold`` (red) and the training points (black crosses).

    Args:
        X (numpy.ndarray): training data; columns 0 and 1 are plotted.
        xx (numpy.ndarray): grid x-coordinates (e.g. from ``numpy.meshgrid``).
        yy (numpy.ndarray): grid y-coordinates, same shape as ``xx``.
        threshold (float): contour level drawn as the decision boundary.
        sigma_candidates (sequence of float): ``sigma`` values, one per row.
        rho_candidates (sequence of float): ``rho`` values, one per column.

    Raises:
        TypeError: if ``sigma_candidates`` or ``rho_candidates`` is omitted.
    """
    # Fail before any figure is created; the ``None`` defaults are only
    # placeholders and cannot be iterated.
    if sigma_candidates is None or rho_candidates is None:
        raise TypeError(
            "plot_results() requires both sigma_candidates and rho_candidates"
        )

    n_rows = len(sigma_candidates)
    n_cols = len(rho_candidates)

    plt.figure(figsize=(16, 10))

    for row, sigma in enumerate(sigma_candidates):
        for col, rho in enumerate(rho_candidates):
            # Train the anomaly model
            clf = lsanomaly.LSAnomaly(sigma=sigma, rho=rho)
            clf.fit(X)

            # Get anomaly scores across the grid
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)

            # Subplot indices are 1-based and row-major, so the stride must
            # be the actual number of columns (it was hard-coded to 3).
            subplot = plt.subplot(n_rows, n_cols, row * n_cols + col + 1)

            # Plot the anomaly model response, the decision boundary at
            # ``threshold`` and the training data.  A colormap name is
            # accepted directly, avoiding ``plt.cm.get_cmap``.
            plt.contourf(
                xx,
                yy,
                Z,
                levels=np.linspace(0, 1, 11),
                cmap="GnBu",
            )
            subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors="red")

            cb = plt.colorbar()
            for t in cb.ax.get_yticklabels():
                t.set_fontsize(10)

            plt.scatter(X[:, 0], X[:, 1], c="black", marker="+", s=50, linewidth=2)

            # Raw string: same text as before, without the invalid "\s" escape.
            subplot.set_title(
                r"$\sigma = $ %.3g, $\rho$ = %.3g" % (sigma, rho),
                fontsize=14,
                usetex=True,
            )
            subplot.axes.get_xaxis().set_ticks([])
            subplot.axes.get_yaxis().set_ticks([])
            plt.xlim((-7, 7))
            plt.ylim((-7, 7))

    plt.show()
def train_with_lsanomaly(self, trainX, testX):
    """Fit a default LSAnomaly model on ``trainX`` and label both data sets.

    After prediction, ``self.replace_in_list`` is asked to swap the label
    ``'anomaly'`` for ``-1`` in each prediction list (presumably in place,
    since its return value is not used), and the ``-1`` entries are counted.

    Args:
        trainX: training samples; also used to fit the model.
        testX: samples to label with the fitted model.

    Returns:
        tuple: ``(train_predictions, test_predictions, n_train_anomalies,
        n_test_anomalies)``.
    """
    detector = lsanomaly.LSAnomaly()
    detector.fit(trainX)

    train_labels = detector.predict(trainX)
    test_labels = detector.predict(testX)

    # Normalise the textual anomaly label to -1 before counting.
    for labels in (train_labels, test_labels):
        self.replace_in_list(labels, 'anomaly', -1)

    train_anomalies = train_labels.count(-1)
    test_anomalies = test_labels.count(-1)
    return train_labels, test_labels, train_anomalies, test_anomalies
def use_model(model, df_list, x_columns, params):
    """Fit a detector on the first frame and score every frame in ``df_list``.

    The model is always fitted on ``df_list[0][x_columns]`` and then applied
    to each frame (including the first) in order.

    Args:
        model (str): one of ``'knn'``, ``'svm'``, ``'isolationForest'``
            (the historical accented spelling ``'ísolationForest'`` is still
            accepted), ``'autoencoder'`` or ``'lsanomaly'``.
        df_list (list): data frames; each must contain ``x_columns``.
        x_columns: column selector for the feature columns.
        params (dict): hyper-parameters; the keys read depend on ``model``:
            ``n``/``p`` (knn), ``kernel`` (svm), ``n_estimators``
            (isolation forest), ``hidden_neurons`` (autoencoder),
            ``sigma``/``rho`` (lsanomaly).

    Returns:
        list: one entry per frame holding that frame's per-row scores:

        * knn: mean distance to the ``n`` nearest training neighbours;
        * svm: ``max(score) - score`` of ``score_samples`` within the frame;
        * isolation forest: absolute value of ``score_samples``;
        * autoencoder: ``decision_function`` output (array, not list);
        * lsanomaly: second column (index 1) of ``predict_proba``.

        An unrecognised ``model`` yields an empty list.
    """
    predicted = []

    if model == 'knn':
        neigh = NearestNeighbors(n_neighbors=params['n'], p=params['p'])
        neigh.fit(df_list[0][x_columns])
        for frame in df_list:
            # kneighbors returns (distances, indices); only distances matter.
            distances = neigh.kneighbors(frame[x_columns])[0]
            predicted.append([np.mean(row) for row in distances])

    elif model == 'svm':
        one_class_svm = OneClassSVM(kernel=params['kernel'])
        one_class_svm.fit(df_list[0][x_columns])
        for frame in df_list:
            scores = one_class_svm.score_samples(frame[x_columns])
            # Flip the sign and shift so the frame's highest raw score maps to 0.
            maximum = max(scores)
            predicted.append([(x * -1) + maximum for x in scores])

    elif model in ('ísolationForest', 'isolationForest'):
        # The accented key is kept for existing callers; the plain ASCII
        # spelling is accepted too instead of silently returning [].
        clf = IsolationForest(n_estimators=params['n_estimators'], random_state=0)
        clf.fit(df_list[0][x_columns])
        for frame in df_list:
            scores = clf.score_samples(frame[x_columns])
            predicted.append(list(map(abs, scores)))

    elif model == 'autoencoder':
        clf = AutoEncoder(hidden_neurons=params['hidden_neurons'], verbose=0, random_state=0)
        clf.fit(df_list[0][x_columns])
        for frame in df_list:
            predicted.append(clf.decision_function(frame[x_columns]))

    elif model == 'lsanomaly':
        anomalymodel = lsanomaly.LSAnomaly(sigma=params['sigma'], rho=params['rho'])
        anomalymodel.fit(df_list[0][x_columns].to_numpy())
        for frame in df_list:
            proba = anomalymodel.predict_proba(frame[x_columns].to_numpy())
            predicted.append([row[1] for row in proba])

    return predicted
# Example: treat digit 9 as an unseen (anomalous) class of the digits data.
digits = datasets.load_digits()
X = digits.data
y = digits.target

# Split data into training and test sets, then remove all examples of
# class 9 from the training set, leaving only examples of 0-8.
# NOTE(review): ``sklearn.cross_validation`` was removed in scikit-learn
# 0.20; on newer versions ``train_test_split`` lives in
# ``sklearn.model_selection`` (the import is outside this block).
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.5)
train_inlier_idx = y_train < 9
X_train = X_train[train_inlier_idx, :]
y_train = y_train[train_inlier_idx]

# Fit the model for inlier classes
anomalymodel = lsanomaly.LSAnomaly()
anomalymodel.fit(X_train, y_train)

# Use the outlier score (last column of predict_proba) as a prediction of
# whether each test point belongs to class 9, for which no training data
# was given.
predictions = anomalymodel.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test == 9, predictions[:, -1])
print('AUC=%f' % (metrics.auc(fpr, tpr)))

# Try to assign each test point to classes 0-9, given only training data
# for classes 0-8: any non-numeric prediction is mapped to 9.
y_pred = anomalymodel.predict(X_test)
y_pred = [w if np.isreal(w) else 9 for w in y_pred]

# Single-argument print(...) behaves identically on Python 2 and 3.
print('Confusion matrix for all classes:')
print(metrics.confusion_matrix(y_test, y_pred))
def evaluate(
    X_train,
    y_train,
    X_test,
    y_test,
    outlier_class,
    method_name,
    current_method_aucs,
    sigma,
    rho=0.1,
    nu=0.5,
):
    """
    Evaluation for a method and data set. Calculates the AUC for a single
    evaluation fold and appends it to `current_method_aucs` in place.

    Outlier scores per method:

    - "LSAD": last column of `LSAnomaly.predict_proba`
    - "OCSVM": `1 - decision_function` of a one-class SVM
    - "KNN": distance to the 10th nearest training neighbour
    - "KM": distance to the nearest of up to 20 k-means centres

    Args:
        X_train (numpy.ndarray): independent training variables

        y_train (numpy.ndarray): training labels

        X_test (numpy.ndarray): independent test variables

        y_test (numpy.ndarray): test labels

        outlier_class (int): index of the outlier class

        method_name (str): method being run

        current_method_aucs (list): input to the *results* dictionary;
            the fold's AUC is appended to it

        sigma (float): kernel lengthscale for LSAD and OCSVM

        rho (float): smoothness parameter for LSAD

        nu (float): OCSVM parameter - see *scikit-learn* documentation

    Returns:
        None

    Raises:
        ValueError: if `method_name` is unknown or a `NaN` is encountered
            in the AUC calculation. Any other exception raised by a model
            is logged and re-raised unchanged.
    """
    try:
        if method_name == "LSAD":
            lsanomaly_model = lsanomaly.LSAnomaly(
                n_kernels_max=500, gamma=sigma**-2, rho=rho
            )
            lsanomaly_model.fit(X_train, y_train)
            predictions = lsanomaly_model.predict_proba(X_test)[:, -1]
        elif method_name == "OCSVM":
            svm_anomaly_model = svm.OneClassSVM(gamma=sigma**-2, nu=nu)
            svm_anomaly_model.fit(X_train)
            predictions = 1 - svm_anomaly_model.decision_function(X_test)
        elif method_name == "KNN":
            # n_neighbors is passed by keyword: recent scikit-learn releases
            # reject it as a positional argument.
            anomaly_model = neighbors.NearestNeighbors(n_neighbors=10)
            anomaly_model.fit(X_train)
            dists, idx = anomaly_model.kneighbors(X_test)
            predictions = dists[:, -1]
        elif method_name == "KM":
            # Never ask for more clusters than there are training samples.
            km = cluster.KMeans(n_clusters=min(X_train.shape[0], 20))
            km.fit(X_train)
            nn = neighbors.NearestNeighbors(n_neighbors=1)
            nn.fit(km.cluster_centers_)
            dists, idx = nn.kneighbors(X_test)
            predictions = dists[:, 0]
        else:
            raise ValueError("unknown method: {}".format(method_name))

        fpr, tpr, thresholds = metrics.roc_curve(y_test == outlier_class, predictions)
        metric_auc = metrics.auc(fpr, tpr)
        logger.debug("\tAUC: {:>6.4f}".format(metric_auc))

        if not math.isnan(metric_auc):
            current_method_aucs.append(metric_auc)
        else:
            raise ValueError("NaN encountered in {}".format(method_name))

    except Exception as e:
        # Log with traceback (logger.exception attaches it) and propagate.
        logger.exception("\t{} {}: {}".format(method_name, type(e), str(e)))
        raise
def compute_scores(normal_users, queue, Ks=()):
    '''
    Calculates the novelty scores (noise and strangeness) of the newest
    entry, ``queue[-1]``, for the 4 algorithms.

    Receives the list of normal users, the queue (all users) and the
    sequence of curiosity factors Ks.

    Ks must hold exactly four values, in this order:
        K_GMM_n, K_KMeans_n -- noise curiosity factors
        K_GMM_s, K_KMeans_s -- strangeness curiosity factors
    A ValueError is raised by the unpacking otherwise (including when Ks is
    left at its empty default).

    Resets and updates the global variables GMM_n, one_n, lsa_n, K_n,
    GMM_s, one_s, lsa_s, K_s with the results (names ending in _n hold the
    noise score, names ending in _s the strangeness score) and also
    returns them, in that order, as a tuple of lists.

    For One-Class SVM and LSA, when asked to predict the new entry, a label
    is directly returned:
        LSA:           'anomaly' or 0 (normal)
        One-Class SVM: -1 (anomaly) or 1 (normal)
    and it is stored as 1 (anomalous) or 0 (normal).

    GMM and K-means produce a fitting score.  The novelty score is obtained
    by calling get_score_last_item, which compares the score of the entry
    with the scores of all other entries (z-score); a returned value >= 1
    means the new entry is anomalous.
    '''
    global GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s

    # Every call starts from empty result lists.
    GMM_n = []
    one_n = []
    lsa_n = []
    K_n = []
    GMM_s = []
    one_s = []
    lsa_s = []
    K_s = []

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks

    # ---- Noise scores ----------------------------------------------------
    # The queue is the base of knowledge: every entry but the last one is
    # fitted, then all entries (including the last) are scored.
    B = GMM(covariance_type='full', n_components=1)
    B.fit(queue[0:-1])
    x = [B.score([i]).mean() for i in queue]
    GMM_n.append(get_score_last_item(x, K_GMM_n))

    K = KMeans(n_clusters=1)
    K.fit(queue[0:-1])
    x = [K.score([i]) for i in queue]
    K_n.append(get_score_last_item(x, K_KMeans_n))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(queue[0:-1])
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_n.append(1)
    elif x == 1:
        one_n.append(0)

    X = np.array(queue[0:-1])
    anomalymodel = lsanomaly.LSAnomaly()
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]]))
    if x == ['anomaly']:
        lsa_n.append(1)
    elif x == [0]:
        lsa_n.append(0)

    # ---- Strangeness scores ----------------------------------------------
    # The normal users are the base of knowledge.  The new entry is appended
    # so that get_score_last_item scores it as the last item.
    normal_and_new = normal_users + [queue[-1]]

    B = GMM(covariance_type='full', n_components=1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    GMM_s.append(get_score_last_item(x, K_GMM_s))

    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    K_s.append(get_score_last_item(x, K_KMeans_s))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(normal_users)
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_s.append(1)
    elif x == 1:
        one_s.append(0)

    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]]))
    if x == ['anomaly']:
        lsa_s.append(1)
    elif x == [0]:
        lsa_s.append(0)

    return GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s
def eval(
    X_train,
    y_train,
    X_test,
    y_test,
    outlier_class,
    method,
    method_name,
    current_method_aucs,
    sigma,
    rho=0.1,
    nu=0.5,
):
    """
    Calculate the AUC of one anomaly-detection method for a single fold and
    append it to `current_method_aucs` in place.

    Note: the name shadows the built-in `eval`; it is kept because callers
    depend on it.

    Outlier scores per method:

    - "LSAD": last column of `LSAnomaly.predict_proba`
    - "OCSVM": `1 - decision_function` of a one-class SVM
    - "KNN": distance to the 10th nearest training neighbour
    - "KM": distance to the nearest of up to 20 k-means centres
    - "DBS": 1.0 for test points DBSCAN labels as noise (-1), else 0.0;
      DBSCAN is run on `X_test` itself, the training data is not used

    Args:
        X_train (numpy.ndarray): independent training variables
        y_train (numpy.ndarray): training labels (used by "LSAD" only)
        X_test (numpy.ndarray): independent test variables
        y_test (numpy.ndarray): test labels
        outlier_class (int): label of the outlier class in `y_test`
        method: unused; kept for interface compatibility
        method_name (str): one of "LSAD", "OCSVM", "KNN", "KM", "DBS"
        current_method_aucs (list): receives the fold's AUC
        sigma (float): kernel lengthscale (LSAD, OCSVM) and DBSCAN `eps`
        rho (float): smoothness parameter for LSAD
        nu (float): OCSVM parameter

    Returns:
        None

    Raises:
        ValueError: if `method_name` is unknown or the AUC is `NaN`. Any
            other exception raised by a model is logged and re-raised.
    """
    predictions = None
    try:
        if method_name == "LSAD":
            lsanomaly_model = lsanomaly.LSAnomaly(
                n_kernels_max=500, gamma=sigma**-2, rho=rho
            )
            lsanomaly_model.fit(X_train, y_train)
            predictions = lsanomaly_model.predict_proba(X_test)[:, -1]
        elif method_name == "OCSVM":
            svm_anomaly_model = svm.OneClassSVM(gamma=sigma**-2, nu=nu)
            svm_anomaly_model.fit(X_train)
            predictions = 1 - svm_anomaly_model.decision_function(X_test)
        elif method_name == "KNN":
            # n_neighbors is passed by keyword: recent scikit-learn releases
            # reject it as a positional argument.
            anomaly_model = neighbors.NearestNeighbors(n_neighbors=10)
            anomaly_model.fit(X_train)
            dists, idx = anomaly_model.kneighbors(X_test)
            predictions = dists[:, -1]
        elif method_name == "KM":
            # Never ask for more clusters than there are training samples.
            km = cluster.KMeans(n_clusters=min(X_train.shape[0], 20))
            km.fit(X_train)
            nn = neighbors.NearestNeighbors(n_neighbors=1)
            nn.fit(km.cluster_centers_)
            dists, idx = nn.kneighbors(X_test)
            predictions = dists[:, 0]
        elif method_name == "DBS":
            dbs_anomaly_model = cluster.DBSCAN(eps=sigma, min_samples=3)
            clusters = dbs_anomaly_model.fit_predict(X_test)
            # Previously the labels were discarded and `predictions` stayed
            # None.  DBSCAN marks noise samples with the label -1, so use
            # that as a binary outlier score.
            predictions = (clusters == -1).astype(float)
        else:
            raise ValueError("unknown method: {}".format(method_name))

        fpr, tpr, thresholds = metrics.roc_curve(y_test == outlier_class, predictions)
        metric_auc = metrics.auc(fpr, tpr)
        logger.debug("\tAUC: {:>6.4f}".format(metric_auc))

        if not math.isnan(metric_auc):
            current_method_aucs.append(metric_auc)
        else:
            raise ValueError("NaN encountered in {}".format(method_name))

    except Exception as e:
        # Log with traceback (logger.exception attaches it) and propagate.
        logger.exception("\t{} {}: {}".format(method_name, type(e), str(e)))
        raise
# First 319 rows of Y2 form the evaluation sequence.
Y_test = Y2[:319, :]


# In[159]:

X_train = X2[:5000, :]
X_test = X2[10000:15000, :]


# In[160]:

# Plot the first two columns of the test slice.
plt.plot(X_test[:, 0])
plt.plot(X_test[:, 1])


# In[161]:

# Train the model (Y_train is defined earlier in the notebook).
anomalymodel = lsanomaly.LSAnomaly(sigma=0.5, rho=1)
anomalymodel.fit(Y_train)

# Static prediction: every sample is scored independently (iid assumption).
y_pred_static = anomalymodel.predict_proba(Y_test)

# Sequential prediction with a known 2x2 transition matrix ``A`` and
# initial state probabilities ``pi``.
A = np.array([[0.999, 0.001], [0.01, 0.99]])
pi = np.array([0.5, 0.5])
y_pred_dynamic = anomalymodel.predict_sequence(Y_test, A, pi)


# In[162]:

plt.clf()
plt.figure(figsize=(10, 6))
def generate_neural_classifier(data_list):
    """Return an ``lsanomaly.LSAnomaly`` model fitted on ``data_list``.

    Despite the function's name, the model built here is an LSAnomaly
    detector with default parameters.

    Args:
        data_list: samples to train on; converted with ``numpy.array``.

    Returns:
        The fitted ``LSAnomaly`` instance.
    """
    training_matrix = np.array(data_list)
    detector = lsanomaly.LSAnomaly()
    detector.fit(training_matrix)
    return detector