def do_one_clustering(df, gmms):
    # headers and k come from module-level globals set up by the caller
    df_train = copy.deepcopy(df)

    # project each record down to 2 dimensions via the per-feature GMMs
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # kNN cosine-similarity graph, then spectral clustering on it
    A = affinity.get_affinity_matrix(cproj,
                                     metric_method=distance.cosdist,
                                     knn=8)
    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
    return res, cproj
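The helper above leans on two project modules (reduction, affinity). For reference, here is a minimal self-contained sketch of the same spectral step, assuming only numpy and scikit-learn: synthetic 2-D data and a hand-rolled kNN cosine-similarity graph stand in for the project's affinity.get_affinity_matrix.

import numpy as np
from sklearn.cluster import SpectralClustering

def spectral_demo(knn=8):
    # two directional clusters so cosine similarity separates them
    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal([5, 0], 0.5, (50, 2)),
                   rng.normal([0, 5], 0.5, (50, 2))])

    # cosine similarity, clipped so the precomputed affinities stay non-negative
    unit = X / np.linalg.norm(X, axis=1, keepdims=True)
    S = np.clip(unit.dot(unit.T), 0.0, None)

    # keep only each row's knn strongest links, then symmetrise
    A = np.zeros_like(S)
    for i in range(len(S)):
        nearest = np.argsort(S[i])[-(knn + 1):]   # self plus knn neighbours
        A[i, nearest] = S[i, nearest]
    A = np.maximum(A, A.T)

    sc = SpectralClustering(n_clusters=2,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    return sc.labels_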
# orphaned script fragment; an enclosing main() is assumed here so it parses
def main():
    df_training_20, df_training_full, gmms_training_20, gmms_training_full = preprocessing.get_preprocessed_training_data()
    df_test_20, df_test_full, gmms_test_20, gmms_test_full = preprocessing.get_preprocessed_test_data()

    df = df_training_20
    gmms = gmms_training_20
    df = df[0:1000]

    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)
    headers.remove('protocol_type')
    headers.remove('attack')
    headers.remove('difficulty')

    print "reducing..."
    proj = reduction.gmm_reduction(df_train, headers, gmms)

    print "plotting..."
    # one bucket per attack class; attacks is the global list of attack types
    true_labels = []
    for i in range(len(attacks)):
        true_labels.append([])

    # per-record attack labels (indices into attacks), not the class list itself
    true_attack_types = df["attack"].values.tolist()

    for i, d in enumerate(proj):
        true_labels[true_attack_types[i]].append(d)

    # title for the plots
    titles = ['Normal data',
              'Abnormal data',
              'Data']
        "b", "g", "r", "c", "m", "k", "w", "0.20", "0.75", "#eeefff",
        "#000fff", "#235234", "#345454", "#5766723", "#263543", "#078787",
        "#567576", "#745655", "#958673", "#262434", "#dd2453", "#eee253",
        "#fff332"
    ]

# another orphaned script fragment; again assuming an enclosing main()
def main():
    import time
    start = time.time()

    df, headers, gmms = preprocessing.get_preprocessed_data()
    df = df[0:100]

    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    print "reductioning..."
    proj = reduction.gmm_reduction(df_train, headers, gmms)

    A = affinity.get_affinity_matrix(proj,
                                     metric_method=distance.cosdist,
                                     knn=5)
    D = affinity.get_degree_matrix(A)

    print A

    elapsed = (time.time() - start)
    print "done in %s seconds" % (elapsed)

    plt.show()
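affinity.get_degree_matrix (and the get_laplacian_matrix call commented out in a later snippet) have standard spectral-graph definitions; assuming the project helpers follow them, they amount to:

import numpy as np

def degree_matrix(A):
    # D is diagonal with D[i, i] = sum_j A[i, j]
    return np.diag(np.asarray(A).sum(axis=1))

def laplacian_matrix(A, D):
    # unnormalised graph Laplacian L = D - A
    return D - np.asarray(A)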
def test_clustering(df, gmms, title="", save_to_file=False, highlight_point=None):
    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack',1,inplace=True)
    df_train.drop('difficulty',1,inplace=True)

    # from about 30 dimension to 2 dimension
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels : try to make sort of dictionary per each label
    data_per_true_labels = []
    for i in range( len(attacks) ):
        data_per_true_labels.append([])

    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj, metric_method=distance.cosdist, knn=8)

    k = predict_k(A)
    print "supposed k : " + str(k)

    lim = int(len(df) * 0.01)
    lim = 12
#    if lim < 3 or lim > 10 :
#        lim = 10
    k = lim
    print "Total number of clusters : " + str(k)

    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_

    # cluster data set
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k) :
        clusters_data.append([])
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal :
            clusters[ res[i] ] = clusters[ res[i] ] + 1
        else :
            clusters[ res[i] ] = clusters[ res[i] ] - 1
        clusters_data[ res[i] ].append(p)

    # cluster recheck with density
    for i, cluster in enumerate(clusters) :
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    ds = []
    for i, cluster in enumerate(clusters) :
        if cluster > 0 :
            d = check_abnormal_with_density(clusters_xmean[i],
                clusters_ymean[i],
                clusters_xstd[i],
                clusters_ystd[i],
                len(clusters_data[i]))
            ds.append(d)
            if 0 > d:
                clusters[i] = -99999
        else :
            ds.append(None)
    print ("ds")
    print ds
def test_clustering(df,
                    gmms,
                    title="",
                    save_to_file=False,
                    highlight_point=None):
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # reduce from about 30 dimensions down to 2
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels: one bucket of projected points per true label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])

    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj,
                                     metric_method=distance.cosdist,
                                     knn=8)

    k = predict_k(A)
    logger.debug("supposed k : " + str(k))

    # override predict_k: use 1% of the data, snapped to 10 when outside [3, 10]
    lim = int(len(df) * 0.01)
    if lim < 3 or lim > 10:
        lim = 10
    k = lim
    logger.debug("Total number of clusters : " + str(k))

    logger.debug(A)
    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
    logger.debug(res)

    # normal records vote +1 and attacks -1, so the sign of clusters[j]
    # marks cluster j as normal (> 0) or abnormal (< 0)
    clusters = [0] * k
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1

    print_confusion_matrix(true_attack_types, clusters, res, highlight_point)

    logger.debug("Cluster count")
    counts = [0] * k
    for _, c in enumerate(res):
        counts[c] = counts[c] + 1
    logger.debug(str(counts))

    print "save to file..." + title
    with open(today + "/" + title + '_cproj.pkl', 'wb') as output:
        pickle.dump(cproj, output, -1)
    with open(today + '/./' + title + '_res.pkl', 'wb') as output:
        pickle.dump(res, output, -1)
    with open(today + '/./' + title + '_df.pkl', 'wb') as output:
        pickle.dump(df, output, -1)
    with open(today + '/./' + title + '_highlight_point.pkl', 'wb') as output:
        pickle.dump(highlight_point, output, -1)
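Nothing in this snippet reads the pickles back; a small usage sketch for reloading a saved run, with file names mirroring the dump calls above (today and title as used there):

import pickle

def load_clustering_result(today, title):
    # reload the four objects dumped by test_clustering
    result = {}
    for name in ('cproj', 'res', 'df', 'highlight_point'):
        with open(today + "/" + title + "_" + name + ".pkl", "rb") as f:
            result[name] = pickle.load(f)
    return result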
def test_clustering(df, gmms, title="", save_to_file=False, point=None):
    df_train = copy.deepcopy(df)
    true_values = df_train["attack"].values.tolist()
    df_train.drop('attack',1,inplace=True)
    df_train.drop('difficulty',1,inplace=True)

#    print "reductioning..."
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

#    print "plotting..."
    data_per_true_labels = []
    for i in range( len(attacks) ):
        data_per_true_labels.append([])
    true_attack_types = df["attack"].values.tolist()

    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    fig, axarr = plt.subplots(3, 4, sharex='col', sharey='row')
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.xlim(plot_lim_min, plot_lim_max)
    plt.ylim(plot_lim_min, plot_lim_max)

    ax1 = axarr[0, 0]
    ax2 = axarr[0, 1]
    ax3 = axarr[0, 2]
    ax4 = axarr[0, 3]
    ax5 = axarr[1, 0]
    ax6 = axarr[1, 1]
    ax7 = axarr[1, 2]
    ax8 = axarr[1, 3]
    ax9 = axarr[2, 0]
    ax10 = axarr[2, 1]
    ax11 = axarr[2, 2]
    ax12 = axarr[2, 3]

    ax1.set_title("True labels")
    for i, p in enumerate(data_per_true_labels) :
        x = [t[0] for t in p]
        y = [t[1] for t in p]
        x = np.array(x)
        y = np.array(y)
        colors = []
        if point is None:
            # one color per point: green for normal traffic, red for attacks
            for _ in range(len(x)):
                if i == model.attack_normal:
                    colors.append('g')
                else:
                    colors.append('r')
        else:
            for _ in range(len(x)):
                if i == point:
                    colors.append(colorhex.codes[i])
                elif i == model.attack_normal:
                    colors.append('g')
                else:
                    colors.append('r')

        ax1.scatter(x, y, c=colors)


##############################################################
    ax2.set_title("True normal")
    for i, p in enumerate(data_per_true_labels) :
        x = [t[0] for t in p]
        y = [t[1] for t in p]
        x = np.array(x)
        y = np.array(y)
        if i == model.attack_normal:
            ax2.scatter(x, y, c='g')
##############################################################
    ax3.set_title("True abnormal")
    for i, p in enumerate(data_per_true_labels) :
        x = [t[0] for t in p]
        y = [t[1] for t in p]
        x = np.array(x)
        y = np.array(y)
        if i != model.attack_normal:
            ax3.scatter(x, y, c='r')
##############################################################
#    A = affinity.get_affinity_matrix(proj, metric_method=distance.dist, metric_param='euclidean', knn=8)
    A = affinity.get_affinity_matrix(proj, metric_method=distance.cosdist, knn=8)
#    D = affinity.get_degree_matrix(A)
#    L = affinity.get_laplacian_matrix(A,D)
#    X = solver.solve(L)
#    est = KMeans(n_clusters=k)
#    est.fit(cproj)
#    res = est.labels_

    k = predict_k(A)
    print "supposed k : " + str(k)

    # cap the cluster count at 10% of the data when predict_k degenerates
    lim = int(len(df) * 0.1)
    if k == 1:
        k = lim
    if k > lim:
        k = lim
    print "Total number of clusters : " + str(k)

    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)

    res = sc.labels_
def test_clustering(df,
                    gmms,
                    title="",
                    save_to_file=False,
                    highlight_point=None):
    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # reduce from about 30 dimensions down to 2
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels: one bucket of projected points per true label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])

    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj,
                                     metric_method=distance.cosdist,
                                     knn=8)

    k = predict_k(A)
    print "supposed k : " + str(k)

    # k is pinned to 12 for this experiment; the adaptive limits are kept
    # below, commented out, for reference
    #    lim = int(len(df) * 0.01)
    #    if lim < 3 or lim > 10 :
    #        lim = 10
    lim = 12
    k = lim
    print "Total number of clusters : " + str(k)

    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_

    # cluster data set; normal records vote +1 and attacks -1, so the sign
    # of clusters[j] marks cluster j as normal (> 0) or abnormal (< 0)
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k):
        clusters_data.append([])
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1
        clusters_data[res[i]].append(p)

    # cluster recheck with density
    for i, cluster in enumerate(clusters):
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    ds = []
    for i, cluster in enumerate(clusters):
        if cluster > 0:
            d = check_abnormal_with_density(clusters_xmean[i],
                                            clusters_ymean[i],
                                            clusters_xstd[i], clusters_ystd[i],
                                            len(clusters_data[i]))
            ds.append(d)
            if d < 0:
                clusters[i] = -99999
        else:
            ds.append(None)
    print("ds")
    print ds
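check_abnormal_with_density is defined elsewhere in the project and its body is not shown here; the sketch below is only a hypothetical stand-in consistent with the call site (cluster means, stds, and point count), flagging clusters whose points are spread too thin for their size.

def check_abnormal_with_density(xmean, ymean, xstd, ystd, n, min_density=1.0):
    # hypothetical: points per unit of 1-sigma area, minus a size-scaled
    # threshold; negative means "too sparse", so the caller re-flags the
    # cluster as abnormal (xmean and ymean are unused in this sketch)
    area = max(xstd * ystd, 1e-12)
    return n / area - min_density * (n ** 0.5)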
def test_clustering(df,
                    gmms,
                    title="",
                    save_to_file=False,
                    highlight_point=None):
    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack', 1, inplace=True)
    df_train.drop('difficulty', 1, inplace=True)

    # reduce from about 30 dimensions down to 2
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels: one bucket of projected points per true label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])

    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj,
                                     metric_method=distance.cosdist,
                                     knn=8)

    k = predict_k(A)
    logger.debug("supposed k : " + str(k))

    #    lim = int(len(df) * 0.01)
    #    lim = 12
    #    if lim < 3 or lim > 10 :
    #        lim = 10
    # scale the cluster count with the data: roughly 12 clusters per 500 points
    lim = int(len(proj) * 12 / 500.0)
    k = lim
    logger.debug("Total number of clusters : " + str(k))

    logger.debug(A)
    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_
    logger.debug(res)

    # cluster data set; normal records vote +1 and attacks -1, so the sign
    # of clusters[j] marks cluster j as normal (> 0) or abnormal (< 0)
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k):
        clusters_data.append([])
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1
        clusters_data[res[i]].append(p)

    # cluster recheck with density
    for i, cluster in enumerate(clusters):
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    ds = []
    for i, cluster in enumerate(clusters):
        if cluster > 0:
            d = check_abnormal_with_density(clusters_xmean[i],
                                            clusters_ymean[i],
                                            clusters_xstd[i], clusters_ystd[i],
                                            len(clusters_data[i]))
            ds.append(d)
            if d < 0:
                clusters[i] = -99999
        else:
            ds.append(None)
    logger.debug("ds")
    logger.debug(ds)

    # report
    print_confusion_matrix(true_attack_types, clusters, res, highlight_point,
                           clusters_xmean, clusters_ymean, clusters_xstd,
                           clusters_ystd)

    logger.debug("Clusters")
    logger.debug(clusters)
    counts = [0] * k
    for _, c in enumerate(res):
        counts[c] = counts[c] + 1
    logger.debug("Cluster datacount")
    logger.debug(str(counts))

    # save to file
    print "save to file..." + title
    with open(today + "/" + title + '_cproj.pkl', 'wb') as output:
        pickle.dump(cproj, output, -1)
    with open(today + '/./' + title + '_res.pkl', 'wb') as output:
        pickle.dump(res, output, -1)
    with open(today + '/./' + title + '_df.pkl', 'wb') as output:
        pickle.dump(df, output, -1)
    with open(today + "/" + title + '_clusters_xmean.pkl', 'wb') as output:
        pickle.dump(clusters_xmean, output, -1)
    with open(today + "/" + title + '_clusters_ymean.pkl', 'wb') as output:
        pickle.dump(clusters_ymean, output, -1)
    with open(today + "/" + title + '_clusters_xstd.pkl', 'wb') as output:
        pickle.dump(clusters_xstd, output, -1)
    with open(today + "/" + title + '_clusters_ystd.pkl', 'wb') as output:
        pickle.dump(clusters_ystd, output, -1)
    with open(today + '/./' + title + '_highlight_point.pkl', 'wb') as output:
        pickle.dump(highlight_point, output, -1)
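predict_k is a project helper too; a common way to guess k from an affinity matrix is the eigengap heuristic on the normalised graph Laplacian. A sketch under that assumption, not necessarily what the project's predict_k does:

import numpy as np

def predict_k_eigengap(A, max_k=20):
    # normalised Laplacian L = I - D^(-1/2) A D^(-1/2)
    A = np.asarray(A, dtype=float)
    d = np.maximum(A.sum(axis=1), 1e-12)
    d_inv_sqrt = 1.0 / np.sqrt(d)
    L = np.eye(len(A)) - A * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]
    # the number of near-zero eigenvalues before the largest gap is a
    # reasonable guess for the number of clusters
    vals = np.sort(np.linalg.eigvalsh(L))[:max_k]
    gaps = np.diff(vals)
    return int(np.argmax(gaps)) + 1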