def fit(self, X=None):
    """Fit the Isolation Forest model on the feature matrix.

    INPUT:
      X   Optional n x p features variable. When supplied it replaces
          self.X; otherwise the values already held in self.X (set by the
          .load_files() step) are used.

    OUTPUT:
      None. The fitted model is stored in self.iFmodel.

    When self.show_calc_time is true, progress and the elapsed wall-clock
    time of the fit are printed.
    """
    # Identity test on purpose: an `!=` comparison would be evaluated
    # element-wise should X be a NumPy array.
    if X is not None:
        self.X = X

    self.iFmodel = isof.iForest(n_estimators=self.n_estimators,
                                max_depth=self.max_depth)

    # Read the flag once so the start/stop halves of the timing always agree.
    timed = self.show_calc_time
    if timed:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Constructing iTrees ...")
        start = time.time()

    self.iFmodel.fit(self.X)

    if timed:
        totsecs = time.time() - start
        mins = int(totsecs / 60)
        secs = totsecs - 60.0 * mins
        print("Elapsed time = {0} minutes, {1} seconds".format(mins, secs))
def fit(self, X=None):
    """Build the Isolation Forest from the stored (or supplied) features.

    INPUT:
      X   Optional n x p features variable. If given, it overwrites self.X;
          if omitted, the data already in self.X (from the .load_files()
          step) is used unchanged.

    OUTPUT:
      None. The trained forest is kept in self.iFmodel.

    If self.show_calc_time is set, a start message and the elapsed
    wall-clock time (minutes and seconds) are printed around the fit.
    """
    # `is not None` rather than `!= None`: the latter would compare
    # element-wise if X happens to be an array.
    if X is not None:
        self.X = X

    self.iFmodel = isof.iForest(
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
    )

    report_time = self.show_calc_time
    if report_time:
        # print(...) with one argument is valid and identical on Python 2 and 3.
        print("Constructing iTrees ...")
        start = time.time()

    self.iFmodel.fit(self.X)

    if report_time:
        totsecs = time.time() - start
        mins = int(totsecs / 60)
        secs = totsecs - 60.0 * mins
        print("Elapsed time = {0} minutes, {1} seconds".format(mins, secs))
def __init__(self, signal):
    """Extract features from `signal` and build an iForest over them.

    The extracted features are kept in self.data and the forest in
    self.iforest.
    """
    features = self.feautre_extraction(signal)
    self.data = features

    # Per-column maximum and minimum of the features, each doubled, then
    # stacked and transposed so the two values for a column sit side by side.
    doubled_max = np.amax(features, axis=0) * 2
    doubled_min = np.amin(features, axis=0) * 2
    bounds = np.array([doubled_max, doubled_min], dtype=np.float64).T

    # NOTE(review): the third argument is presumably the number of trees --
    # confirm against the iForest constructor.
    self.iforest = iForest(features, bounds, 100)
def scatter_plot(X, Y, IDs, yname, xname, title, val):
    """Score the points of one feature pair with the selected algorithm.

    The scoring algorithm is chosen by the module-level flags
    `algo_oddball` and `algo_iForests` (checked in that order).

    Args:
        X, Y: Paired feature values, indexable in parallel.
        IDs: Identifiers handed to the scoring routine together with X and Y.
        yname, xname, title: Accepted for interface compatibility; this
            variant does not use them because figure setup is disabled.
        val: Passed through to parametric_min / parametric_max.

    Returns:
        (scores, construction_time, scoring_time). Both timings are 0 on
        the oddball path; on the iForest path they are whatever iForest
        returns.

    Raises:
        RuntimeError: if neither algorithm flag is set (and print_fail
            returns instead of terminating the program).

    Side effects:
        Increments the global `plot_num`. On the oddball path the median
        line is drawn with plt.plot on whatever figure is current.
    """
    global plot_num
    plot_num = plot_num + 1

    construction_time, scoring_time = 0, 0
    if algo_oddball:
        # Interpolate the median line through 10 log-spaced bin edges.
        minX = parametric_min(X, val)
        maxX = parametric_max(X, val)
        binedges = np.logspace(log10(minX), log10(maxX), 10)
        # Bin membership does not depend on the loop index: compute it once.
        bin_index = np.digitize(X, binedges)

        # First anchor: median of the Y values whose X equals minX.
        median_points_X = [minX]
        median_points_Y = [
            get_median([Y[j] for j, x in enumerate(X) if x == minX])
        ]

        # range() instead of xrange(): same iteration, valid on Python 2 and 3.
        for i in range(1, 10):
            mid_x = int(geometric_mean(binedges[i], binedges[i - 1]))
            mid_y = get_median(
                [Y[j] for j, b in enumerate(bin_index) if b == i])
            # A NaN median (presumably an empty bin) contributes no point.
            if not isnan(float(mid_y)):
                median_points_X.append(mid_x)
                median_points_Y.append(mid_y)

        # Last anchor: median of the Y values whose X equals maxX.
        median_points_X.append(maxX)
        median_points_Y.append(
            get_median([Y[j] for j, x in enumerate(X) if x == maxX]))

        plt.plot(median_points_X, median_points_Y, 'ro-')

        # Calculate Oddball Scores
        scores = oddball.get_scores(median_points_X, median_points_Y,
                                    X, Y, IDs)
    elif algo_iForests:
        features = combine_features([IDs, X, Y])
        scores, construction_time, scoring_time = iForest(features)
    else:
        print_fail("Scoring Algorithm not Chosen")
        # Without this, falling through to the return below would fail with
        # an UnboundLocalError on `scores`, hiding the real cause.
        raise RuntimeError("Scoring Algorithm not Chosen")

    return scores, construction_time, scoring_time
def scatter_plot(feature_X, feature_Y, make_plot=False):
    """Score a feature pair with iForest and optionally plot it.

    Args:
        feature_X, feature_Y: Feature objects; get_data(),
            get_description() and get_log() are called on them.
        make_plot: When true, the scatter plot is drawn on the figure.

    Returns:
        (fig, scores). A new figure is always created and returned; it is
        left empty when make_plot is false.
    """
    ids, data = combine_features([feature_X, feature_Y])
    scores = iForest(ids, data)
    fig = plt.figure()

    if not make_plot:
        return fig, scores

    xs = feature_X.get_data()
    ys = feature_Y.get_data()

    axes = fig.add_subplot(111)
    label_size = SIZES['label']
    axes.set_xlabel(feature_X.get_description(), fontsize=label_size)
    axes.set_ylabel(feature_Y.get_description(), fontsize=label_size)
    axes.xaxis.set_tick_params(labelsize=SIZES['tick'])
    axes.yaxis.set_tick_params(labelsize=SIZES['tick'])
    plt.gcf().subplots_adjust(bottom=0.18, left=0.18)

    # NOTE(review): the source indentation was lost; the multiplicative
    # padding of the limits is assumed to apply only to log-scaled axes --
    # confirm against the original file.
    if feature_X.get_log():
        axes.set_xscale('log')
        x_low, x_high = min(xs) / 2.0, ceil(max(xs) * 2.0)
        plt.xlim([x_low, x_high])
    if feature_Y.get_log():
        axes.set_yscale('log')
        y_low, y_high = min(ys) / 2.0, ceil(max(ys) * 2.0)
        plt.ylim([y_low, y_high])

    plt.plot(xs, ys, 'k.')
    return fig, scores
discription[Y] + ' vs ' + discription[X], compare_value[X]) pp.savefig(fig) update_progress(i + 1, len(feature_pairs)) pp.close() scatter_plots = len(feature_pairs) print_ok('Scatter Plots Generated') """ PlotSPOT Algorithm """ # Get Outliers Scores if using iForests if generate_iForest: cprint("Generating Graph File") features = combine_features([ eval(F) for F in identity_features + continuous_features + discrete_features ]) iForest(features) print_ok("iForest Generation Complete") file = open(filefolder + "Log.txt", 'w') N_list = [10, 20, 50, 75, 100] for N_val in N_list: # Create graph between outliers and plots cprint("Generating Graph File") ranklist.generate_graph(P_val, N_val) print_ok("Graph File Generated") # Run plotSpot to get selected graphs Budget = [1, 2, 3, 4, 5, 6] for B in Budget: for algo in ["SpellOut", "Greedy", "G_Norm"]: print "N_val = ", N_val, " Budget = ", B, " ALGO = ", algo start_time = time.time()
def scatter_plot(X, Y, IDs, yname, xname, title, val):
    """Draw the log-log scatter plot of one feature pair and score its points.

    The scoring algorithm is chosen by the module-level flags
    `algo_oddball` and `algo_iForests` (checked in that order). The scores
    are written out with write_to_file under the new plot number.

    Args:
        X, Y: Paired feature values, indexable in parallel.
        IDs: Identifiers handed to the scoring routine together with X and Y.
        yname, xname: Axis captions, placed as figure-level text.
        title: Plot title.
        val: Passed through to parametric_min / parametric_max.

    Returns:
        The matplotlib figure that was created for this plot.

    Raises:
        RuntimeError: if neither algorithm flag is set (and print_fail
            returns instead of terminating the program).

    Side effects:
        Increments the global `plot_num` and uses it as the figure number.
    """
    # Main Plot
    global plot_num
    plot_num = plot_num + 1
    fig = plt.figure(plot_num)
    fig.text(0.5, 0.04, xname, ha='center')
    fig.text(0.04, 0.5, yname, va='center', rotation='vertical')
    # Called for its side effect of creating the axes; the handle is unused.
    fig.add_subplot(111)
    plt.title(title)
    plt.ylim([min(Y) / 2.0, ceil(max(Y) * 2.0)])
    plt.xlim([min(X) / 2.0, ceil(max(X) * 2.0)])
    plt.loglog(X, Y, 'k.')

    if algo_oddball:
        # Interpolate the median line through 10 log-spaced bin edges.
        minX = parametric_min(X, val)
        maxX = parametric_max(X, val)
        binedges = np.logspace(log10(minX), log10(maxX), 10)
        # Bin membership does not depend on the loop index: compute it once.
        bin_index = np.digitize(X, binedges)

        # First anchor: median of the Y values whose X equals minX.
        median_points_X = [minX]
        median_points_Y = [
            get_median([Y[j] for j, x in enumerate(X) if x == minX])
        ]

        # range() instead of xrange(): same iteration, valid on Python 2 and 3.
        for i in range(1, 10):
            mid_x = int(geometric_mean(binedges[i], binedges[i - 1]))
            mid_y = get_median(
                [Y[j] for j, b in enumerate(bin_index) if b == i])
            # A NaN median (presumably an empty bin) contributes no point.
            if not isnan(float(mid_y)):
                median_points_X.append(mid_x)
                median_points_Y.append(mid_y)

        # Last anchor: median of the Y values whose X equals maxX.
        median_points_X.append(maxX)
        median_points_Y.append(
            get_median([Y[j] for j, x in enumerate(X) if x == maxX]))

        plt.plot(median_points_X, median_points_Y, 'ro-')

        # Calculate Oddball Scores
        scores = oddball.get_scores(median_points_X, median_points_Y,
                                    X, Y, IDs)
    elif algo_iForests:
        features = combine_features([IDs, X, Y])
        scores = iForest(features)
    else:
        print_fail("Scoring Algorithm not Chosen")
        # Without this, write_to_file(scores, ...) below would fail with an
        # UnboundLocalError on `scores`, hiding the real cause.
        raise RuntimeError("Scoring Algorithm not Chosen")

    # Write rank and scores to outputfile
    write_to_file(scores, plot_num)

    # NOTE: a 2-D histogram heat-map sub-plot (np.histogram2d + plt.imshow
    # on log-spaced bins) used to be drawn here; it is currently disabled.
    return fig
def main():
    """Benchmark several outlier detectors on the configured .mat datasets.

    For every dataset the features are z-scored, then each configured
    algorithm is trained and evaluated over 5 rounds, each on a fresh split
    from cross_validation(X, y, 0.7). Per algorithm the mean ROC AUC, mean
    accuracy and mean training CPU time are printed.

    Algorithms listed as unsupervised are fitted only on the training rows
    labelled 1; the others are fitted on all training rows with labels.
    """
    # Other datasets used previously: 'cardio.mat', 'shuttle.mat',
    # 'thyroid.mat'.
    datasets = {'mnist.mat'}

    # Algorithms fitted without labels, on the rows labelled 1 only.
    unsupervised = {
        'IsolationForest', 'OneClassSVM', 'LOF',
        'iForest_n', 'iForest_create', 'iForest_paper',
    }
    # Algorithms exposing decision_function, for which a ROC curve is built.
    scored = {
        'SVM', 'SVM_linear', 'LOF', 'OneClassSVM', 'IsolationForest',
        'iForest_n', 'iForest_create', 'iForest_paper',
    }
    # The project iForest variants have their score sign flipped before the
    # ROC computation.
    sign_flipped = {'iForest_n', 'iForest_create'}

    print('================================================================')
    print('> Loading Data ...')
    for mat_fname in datasets:
        X, y = load_from_mat(mat_fname)
        X = stats.zscore(X)
        print('> dataset: ', mat_fname)
        print(
            '================================================================')

        algorithms = {
            # 'KNN': neighbors.KNeighborsClassifier(),
            'SVM': svm.SVC(gamma='auto'),
            # 'SVM_linear': svm.OneClassSVM(kernel='linear'),
            # 'RandomForest': ensemble.RandomForestClassifier(),
            # 'ExtraTrees': ensemble.ExtraTreesClassifier(n_estimators=100),
            # 'DBSCAN': cluster.DBSCAN(eps=1, min_samples=5),
            # 'iForest_create': iForest(with_replacement=True),
            'OneClassSVM': svm.OneClassSVM(gamma='auto'),
            'iForest_n': iForest(with_replacement=True, project_flag=True),
            'iForest_create': iForest(with_replacement=True,
                                      project_flag=False),
            # 'iForest_paper': iForest(with_replacement=False),
            # NOTE(review): `behaviour` was dropped from IsolationForest in
            # later scikit-learn releases -- confirm the targeted version.
            'IsolationForest': ensemble.IsolationForest(contamination=0.2,
                                                        behaviour='new'),
            'LOF': neighbors.LocalOutlierFactor(contamination=0.2,
                                                novelty=True),
        }
        print('> Algorithms: {}'.format(list(algorithms)))
        print('> Accuracy for iForest is not correct')

        for name, clf in algorithms.items():
            print('---------------------------------------------')
            print('> {}'.format(name))
            sum_accuracy = 0
            auc_value = []
            total_time = 0
            ratio = 0.7
            iteration = 5
            for _ in range(iteration):
                data = cross_validation(X, y, ratio)

                # Only the fit is timed.
                t0 = time.process_time()
                if name in unsupervised:
                    X_train = np.array(data['X_train'])
                    y_train = np.array(data['y_train'])
                    X_train = X_train[np.where(y_train == 1)]
                    clf.fit(X_train)
                else:
                    # supervised method.
                    clf.fit(data['X_train'], data['y_train'])
                t1 = time.process_time()

                y_pred = clf.predict(data['X_test'])
                y_real = np.array(data['y_test'])
                # Map the -1 prediction onto 0 before comparing with y_real
                # (assumes the data labels the other class as 0 -- TODO
                # confirm).
                y_pred[np.where(y_pred == -1)] = 0
                # Previously sum_accuracy was never updated, so the reported
                # accuracy was always 0.0.
                sum_accuracy += float(
                    np.mean(np.ravel(y_pred) == np.ravel(y_real)))

                if name in scored:
                    y_score = np.asarray(
                        clf.decision_function(data['X_test']))
                    if name in sign_flipped:
                        y_score = -1 * y_score
                    fpr, tpr, thresholds = metrics.roc_curve(
                        data['y_test'], y_score, pos_label=1)
                    auc_value.append(metrics.auc(fpr, tpr))

                total_time += (t1 - t0)

            print('\nAUC value is: ', np.mean(np.asarray(auc_value)))
            print('Accuracy: {}'.format(float(sum_accuracy) / iteration))
            print('Training Time: {}'.format(float(total_time) / iteration))
            print('---------------------------------------------')
        print('==============================================================')