def do_one_clustering(df, gmms):
    df_train = copy.deepcopy(df)
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    A = affinity.get_affinity_matrix(cproj, metric_method=distance.cosdist, knn=8)
    sc = SpectralClustering(n_clusters=k, affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
    return res, cproj
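# affinity.get_affinity_matrix is defined elsewhere in this repository. A
# minimal sketch of one common construction -- a symmetrized 0/1 k-nearest-
# neighbor graph over cosine distances -- is shown below. The real helper may
# weight edges differently; this is an illustration, not the project's code.
import numpy as np
from scipy.spatial.distance import cdist

def knn_affinity_sketch(points, knn=8):
    points = np.asarray(points, dtype=float)
    dist = cdist(points, points, metric='cosine')  # pairwise cosine distances
    n = len(points)
    A = np.zeros((n, n))
    for i in range(n):
        # indices of the knn nearest neighbors, skipping the point itself
        neighbors = np.argsort(dist[i])[1:knn + 1]
        A[i, neighbors] = 1.0
    return np.maximum(A, A.T)  # symmetrize, as "precomputed" affinity requires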
df_training_20, df_training_full, gmms_training_20, gmms_training_full = \
    preprocessing.get_preprocessed_training_data()
df_test_20, df_test_full, gmms_test_20, gmms_test_full = \
    preprocessing.get_preprocessed_test_data()

df = df_training_20
gmms = gmms_training_20
df = df[0:1000]

df_train = copy.deepcopy(df)
df_train.drop('attack', 1, inplace=True)
df_train.drop('difficulty', 1, inplace=True)

headers.remove('protocol_type')
headers.remove('attack')
headers.remove('difficulty')

print "reducing..."
proj = reduction.gmm_reduction(df_train, headers, gmms)

print "plotting..."
# one bucket per attack type ('attacks' is the global list of attack labels)
true_labels = []
for i in range(len(attacks)):
    true_labels.append([])

true_attack_types = df["attack"].values.tolist()
for i, d in enumerate(proj):
    true_labels[true_attack_types[i]].append(d)

# titles for the plots
titles = ['Normal data', 'Abnormal data', 'Data']
def test_clustering(df, gmms, title="", save_to_file=False, highlight_point=None):
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # reduce from about 30 dimensions to 2
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels: one bucket of projected points per true label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])
    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj, metric_method=distance.cosdist, knn=8)

    k = predict_k(A)
    logger.debug("supposed k : " + str(k))

    lim = int(len(df) * 0.01)
    if lim < 3 or lim > 10:
        lim = 10
    k = lim
    logger.debug("Total number of clusters : " + str(k))
    logger.debug(A)

    sc = SpectralClustering(n_clusters=k, affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
    logger.debug(res)

    # majority vote per cluster: normal points add 1, attack points subtract 1
    clusters = [0] * k
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1

    print_confusion_matrix(true_attack_types, clusters, res, highlight_point)

    logger.debug("Cluster count")
    counts = [0] * k
    for _, c in enumerate(res):
        counts[c] = counts[c] + 1
    logger.debug(str(counts))

    print "save to file..." + title
    with open(today + "/" + title + '_cproj.pkl', 'wb') as output:
        pickle.dump(cproj, output, -1)
    with open(today + "/" + title + '_res.pkl', 'wb') as output:
        pickle.dump(res, output, -1)
    with open(today + "/" + title + '_df.pkl', 'wb') as output:
        pickle.dump(df, output, -1)
    with open(today + "/" + title + '_highlight_point.pkl', 'wb') as output:
        pickle.dump(highlight_point, output, -1)
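# predict_k comes from elsewhere in the repository. A common way to guess the
# number of clusters from an affinity matrix is the eigengap heuristic on the
# normalized graph Laplacian; the sketch below is one plausible implementation
# and is only an assumption about what predict_k actually does.
import numpy as np

def eigengap_predict_k(A, max_k=10):
    A = np.asarray(A, dtype=float)
    d = A.sum(axis=1)
    d[d == 0] = 1e-12                       # guard against isolated nodes
    D_inv_sqrt = np.diag(1.0 / np.sqrt(d))
    # normalized Laplacian L = I - D^(-1/2) A D^(-1/2)
    L = np.eye(len(A)) - D_inv_sqrt.dot(A).dot(D_inv_sqrt)
    eigvals = np.sort(np.linalg.eigvalsh(L))  # ascending eigenvalues
    gaps = np.diff(eigvals[:max_k + 1])
    return int(np.argmax(gaps)) + 1         # k = position of the largest gap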
"b", "g", "r", "c", "m", "k", "w", "0.20", "0.75", "#eeefff", "#000fff", "#235234", "#345454", "#5766723", "#263543", "#078787", "#567576", "#745655", "#958673", "#262434", "#dd2453", "#eee253", "#fff332" ] import time start = time.time() df, headers, gmms = preprocessing.get_preprocessed_data() df = df[0:100] df_train = copy.deepcopy(df) df_train.drop('attack', 1, inplace=True) df_train.drop('difficulty', 1, inplace=True) print "reductioning..." proj = reduction.gmm_reduction(df_train, headers, gmms) A = affinity.get_affinity_matrix(proj, metric_method=distance.cosdist, knn=5) D = affinity.get_degree_matrix(A) print A elapsed = (time.time() - start) print "done in %s seconds" % (elapsed) plt.show()
def test_clustering(df, gmms, title="", save_to_file=False, highlight_point=None):
    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # reduce from about 30 dimensions to 2
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels: one bucket of projected points per true label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])
    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj, metric_method=distance.cosdist, knn=8)

    k = predict_k(A)
    print "supposed k : " + str(k)

    lim = int(len(df) * 0.01)
    lim = 12  # override: fixed cluster count for this experiment
    # if lim < 3 or lim > 10 :
    #     lim = 10
    k = lim
    print "Total number of clusters : " + str(k)

    sc = SpectralClustering(n_clusters=k, affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_

    # cluster data set
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k):
        clusters_data.append([])

    # majority vote per cluster: normal points add 1, attack points subtract 1
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1
        clusters_data[res[i]].append(p)

    # per-cluster statistics for the density recheck
    for i, cluster in enumerate(clusters):
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    # cluster recheck with density: sparse "normal" clusters become abnormal
    ds = []
    for i, cluster in enumerate(clusters):
        if cluster > 0:
            d = check_abnormal_with_density(clusters_xmean[i], clusters_ymean[i],
                                            clusters_xstd[i], clusters_ystd[i],
                                            len(clusters_data[i]))
            ds.append(d)
            if 0 > d:
                clusters[i] = -99999
        else:
            ds.append(None)

    print "ds"
    print ds
def test_clustering(df, gmms, title="", save_to_file=False, point=None):
    df_train = copy.deepcopy(df)
    true_values = df_train["attack"].values.tolist()
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # print "reducing..."
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # print "plotting..."
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])
    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    fig, axarr = plt.subplots(3, 4, sharex='col', sharey='row')
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.xlim(plot_lim_min, plot_lim_max)
    plt.ylim(plot_lim_min, plot_lim_max)

    ax1 = axarr[0, 0]
    ax2 = axarr[0, 1]
    ax3 = axarr[0, 2]
    ax4 = axarr[0, 3]
    ax5 = axarr[1, 0]
    ax6 = axarr[1, 1]
    ax7 = axarr[1, 2]
    ax8 = axarr[1, 3]
    ax9 = axarr[2, 0]
    ax10 = axarr[2, 1]
    ax11 = axarr[2, 2]
    ax12 = axarr[2, 3]

    ax1.set_title("True labels")
    for i, p in enumerate(data_per_true_labels):
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        colors = []
        if point is None:
            if i == model.attack_normal:
                colors.append('g')
            else:
                colors.append('r')
            # for _ in range(len(x)):
            #     colors.append(colorhex.codes[i])
        else:
            for _ in range(len(x)):
                if i == point:
                    colors.append(colorhex.codes[i])
                elif i == model.attack_normal:
                    colors.append('g')
                else:
                    colors.append('r')
        ax1.scatter(x, y, c=colors)

    ##############################################################
    ax2.set_title("True normal")
    for i, p in enumerate(data_per_true_labels):
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        if i == model.attack_normal:
            ax2.scatter(x, y, c='g')

    ##############################################################
    ax3.set_title("True abnormal")
    for i, p in enumerate(data_per_true_labels):
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        if i != model.attack_normal:
            ax3.scatter(x, y, c='r')

    ##############################################################
    # A = affinity.get_affinity_matrix(proj, metric_method=distance.dist,
    #                                  metric_param='euclidean', knn=8)
    A = affinity.get_affinity_matrix(proj, metric_method=distance.cosdist, knn=8)
    # D = affinity.get_degree_matrix(A)
    # L = affinity.get_laplacian_matrix(A, D)
    # X = solver.solve(L)
    # est = KMeans(n_clusters=k)
    # est.fit(cproj)
    # res = est.labels_

    k = predict_k(A)
    print "supposed k : " + str(k)

    lim = int(len(df) * 0.1)
    if k == 1:
        k = lim
    if k > lim:
        k = lim
    print "Total number of clusters : " + str(k)

    sc = SpectralClustering(n_clusters=k, affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
def test_clustering(df, gmms, title="", save_to_file=False, highlight_point=None):
    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # reduce from about 30 dimensions to 2
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels: one bucket of projected points per true label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])
    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj, metric_method=distance.cosdist, knn=8)

    k = predict_k(A)
    logger.debug("supposed k : " + str(k))

    # lim = int(len(df) * 0.01)
    # lim = 12
    # if lim < 3 or lim > 10 :
    #     lim = 10
    lim = int(len(proj) * 12 / 500.0)  # scale cluster count with data size
    k = lim
    logger.debug("Total number of clusters : " + str(k))
    logger.debug(A)

    sc = SpectralClustering(n_clusters=k, affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
    logger.debug(res)

    # cluster data set
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k):
        clusters_data.append([])

    # majority vote per cluster: normal points add 1, attack points subtract 1
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1
        clusters_data[res[i]].append(p)

    # cluster recheck with density
    for i, cluster in enumerate(clusters):
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    ds = []
    for i, cluster in enumerate(clusters):
        if cluster > 0:
            d = check_abnormal_with_density(clusters_xmean[i], clusters_ymean[i],
                                            clusters_xstd[i], clusters_ystd[i],
                                            len(clusters_data[i]))
            ds.append(d)
            if 0 > d:
                clusters[i] = -99999
        else:
            ds.append(None)
    logger.debug("ds")
    logger.debug(ds)

    # report
    print_confusion_matrix(true_attack_types, clusters, res, highlight_point,
                           clusters_xmean, clusters_ymean,
                           clusters_xstd, clusters_ystd)
    logger.debug("Clusters")
    logger.debug(clusters)

    counts = [0] * k
    for _, c in enumerate(res):
        counts[c] = counts[c] + 1
    logger.debug("Cluster datacount")
    logger.debug(str(counts))

    # save to file
    print "save to file..." + title
    with open(today + "/" + title + '_cproj.pkl', 'wb') as output:
        pickle.dump(cproj, output, -1)
    with open(today + "/" + title + '_res.pkl', 'wb') as output:
        pickle.dump(res, output, -1)
    with open(today + "/" + title + '_df.pkl', 'wb') as output:
        pickle.dump(df, output, -1)
    with open(today + "/" + title + '_clusters_xmean.pkl', 'wb') as output:
        pickle.dump(clusters_xmean, output, -1)
    with open(today + "/" + title + '_clusters_ymean.pkl', 'wb') as output:
        pickle.dump(clusters_ymean, output, -1)
    with open(today + "/" + title + '_clusters_xstd.pkl', 'wb') as output:
        pickle.dump(clusters_xstd, output, -1)
    with open(today + "/" + title + '_clusters_ystd.pkl', 'wb') as output:
        pickle.dump(clusters_ystd, output, -1)
    with open(today + "/" + title + '_highlight_point.pkl', 'wb') as output:
        pickle.dump(highlight_point, output, -1)
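# check_abnormal_with_density is not shown in this section. From the call site
# it takes a cluster's x/y mean, x/y standard deviation, and point count, and
# a negative return value reclassifies the cluster as abnormal. The scoring
# rule below is a hypothetical sketch consistent with that contract; the
# formula and threshold are invented for illustration only.
import numpy as np

def check_abnormal_with_density_sketch(xmean, ymean, xstd, ystd, count,
                                       min_density=1.0):
    # xmean/ymean are kept to match the call signature even though this
    # toy score ignores them: density over the one-sigma ellipse area
    area = np.pi * max(xstd, 1e-9) * max(ystd, 1e-9)
    density = count / area
    return density - min_density  # negative => too sparse => flag abnormal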