Example #1
from sklearn import svm

# Helpers such as gen_with_vae, gen_with_smote, fill_with_eucli_distance and
# get_kdd99_big_classification come from the surrounding project and are not
# shown on this page. Note that this variant expands unconditionally: the
# expend flag is accepted but never checked (compare Example #2).
def binary_classify(
    data_train,
    positive,
    negative,
    positive_len,
    negative_len,
    expend,
    discrete_num_map,
    data_name,
):
    # Number of synthetic rows needed to bring the negative class up to the
    # size of the positive class.
    negative_len = positive_len - negative_len

    print('\033[0;36m--expanding' + '-' * 16 + '\033[0m')
    vae_predict = gen_with_vae(negative, negative_len, data_name)
    smote_predict = gen_with_smote(negative, negative_len, data_name,
                                   discrete_num_map)

    # SMOTE leaves discrete attributes as the string 'None'; fill those
    # positions from the VAE-generated row at the same index.
    for row in range(len(smote_predict)):
        for i in range(len(smote_predict[row].attr_list)):
            if smote_predict[row][i] == 'None':
                smote_predict[row][i] = vae_predict[row][i]

    data_predict = [
        fill_with_eucli_distance(data_train, predict, discrete_num_map)
        for predict in smote_predict
    ]

    # Hold out the real negative samples as the test set; the synthetic rows
    # generated above stand in for them during training.
    sp_class = negative
    data_sp = [d for d in data_train if d.data_class == sp_class]
    data_train[:] = [d for d in data_train if d.data_class != sp_class]

    data_train.extend(data_predict)
    train_X = [
        d.discrete_to_num(discrete_num_map).attr_list for d in data_train
    ]
    test_X = [d.discrete_to_num(discrete_num_map).attr_list for d in data_sp]
    train_y = [get_kdd99_big_classification(d.data_class) for d in data_train]
    test_y = [get_kdd99_big_classification(d.data_class) for d in data_sp]

    # TODO PRINT
    print({k: train_y.count(k) for k in set(train_y)})

    sp_class = get_kdd99_big_classification(sp_class)

    clf = svm.SVC()
    # clf = GaussianNB()
    # clf = tree.DecisionTreeClassifier()
    clf.fit(train_X, train_y)

    predict_y = list(clf.predict(test_X))

    print('test data class: ', {k: test_y.count(k) for k in set(test_y)})
    print('predicted data class: ', {k: predict_y.count(k) for k in set(predict_y)})

    # Ratio of rows predicted as sp_class to rows that truly are sp_class
    # (a coverage-style ratio rather than standard accuracy).
    print('acc_true: {:>2.3f}'.format(
        predict_y.count(sp_class) / test_y.count(sp_class)))
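Every example on this page leans on get_kdd99_big_classification, which is not shown here. As a point of reference, here is a minimal sketch, assuming the standard KDD Cup 99 grouping of attack labels into four attack categories plus normal; the label sets and the fallback return value are assumptions, not the project's actual code.

# Minimal sketch, not the project's implementation. Assumes the standard
# KDD Cup 99 grouping: four attack categories plus 'normal'.
KDD99_BIG_CLASSES = {
    'dos':   {'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop'},
    'probe': {'ipsweep', 'nmap', 'portsweep', 'satan'},
    'r2l':   {'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf',
              'spy', 'warezclient', 'warezmaster'},
    'u2r':   {'buffer_overflow', 'loadmodule', 'perl', 'rootkit'},
}

def get_kdd99_big_classification(label):
    """Collapse a fine-grained KDD99 label into its big class."""
    label = label.rstrip('.')  # raw KDD99 labels carry a trailing dot
    for big_class, members in KDD99_BIG_CLASSES.items():
        if label in members:
            return big_class
    return 'normal'  # assumption: unrecognized labels fall back to normal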
Example #2
from sklearn import svm
from sklearn.model_selection import KFold

def binary_classify(data_train, positive, negative, positive_len, negative_len, discrete_num_map, data_name, expend):
    if expend:
        print('\033[0;36m--expanding' + '-' * 16 + '\033[0m')
        negative_len = positive_len - negative_len

        vae_predict = gen_with_vae(negative, negative_len, data_name)
        smote_predict = gen_with_smote(negative, negative_len, data_name, discrete_num_map)

        # SMOTE leaves discrete attributes as the string 'None'; fill those
        # positions from the VAE-generated row at the same index.
        for row in range(len(smote_predict)):
            for i in range(len(smote_predict[row].attr_list)):
                if smote_predict[row][i] == 'None':
                    smote_predict[row][i] = vae_predict[row][i]

        data_predict = [fill_with_eucli_distance(data_train, predict, discrete_num_map)
                        for predict in smote_predict]
        data_train.extend(data_predict)

    X = [d.discrete_to_num(discrete_num_map).attr_list for d in data_train]
    y = [get_kdd99_big_classification(d.data_class) for d in data_train]

    # TODO PRINT
    print({k: y.count(k) for k in set(y)})

    positive = get_kdd99_big_classification(positive)
    negative = get_kdd99_big_classification(negative)

    # Todo: kf
    kf = KFold(n_splits=5, shuffle=True)
    args = None
    for i_train, i_test in kf.split(X, y):
        train_X = [X[i] for i in i_train]
        train_y = [y[i] for i in i_train]
        test_X = [X[i] for i in i_test]
        test_y = [y[i] for i in i_test]
        # clf = svm.SVC(kernel='linear', probability=True,
        #               random_state=np.random.RandomState(0))
        clf = svm.SVC()
        # clf = GaussianNB()
        # clf = tree.DecisionTreeClassifier()
        clf.fit(train_X, train_y)

        predict_y = list(clf.predict(test_X))

        fold_args = compute_classification_indicators(
            *compute_TP_TN_FP_TN(test_y, predict_y, positive, negative))
        # Accumulate per-fold indicators; the 'is None' test avoids
        # misreading an all-zero fold as "not yet initialized".
        if args is None:
            args = fold_args
        else:
            args = [a + t for a, t in zip(args, fold_args)]
    print('acc+: {:>2.3f}, acc-: {:>2.3f}, accuracy: {:>2.3f}, '
          'precision: {:>2.3f}, recall: {:>2.3f}, F1: {:>2.3f}, '
          'G-mean: {:>2.3f}'.format(*[arg / 5 for arg in args]))
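The metric helpers compute_TP_TN_FP_TN and compute_classification_indicators are likewise external. A plausible sketch follows, matching the seven-value order of the format string above (acc+, acc-, accuracy, precision, recall, F1, G-mean); the formulas are the standard ones, but the project's real helpers may differ, and the negative argument is accepted here only for interface compatibility.

import math

def compute_TP_TN_FP_TN(class_test, class_predict, positive, negative):
    # Sketch only: binary confusion-matrix counts with 'positive' as the
    # target class; 'negative' is unused and kept for interface parity.
    TP = TN = FP = FN = 0
    for truth, pred in zip(class_test, class_predict):
        if truth == positive:
            if pred == positive:
                TP += 1
            else:
                FN += 1
        else:
            if pred == positive:
                FP += 1
            else:
                TN += 1
    return TP, TN, FP, FN

def compute_classification_indicators(TP, TN, FP, FN):
    # Sketch only: the seven indicators in the order printed above.
    acc_pos = TP / (TP + FN) if TP + FN else 0.0  # sensitivity (acc+)
    acc_neg = TN / (TN + FP) if TN + FP else 0.0  # specificity (acc-)
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = acc_pos
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    g_mean = math.sqrt(acc_pos * acc_neg)
    return acc_pos, acc_neg, accuracy, precision, recall, f1, g_mean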
Example #3
import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold

# DataType, MySmote and the gen_/fill_ helpers come from the surrounding
# project and are not shown on this page.
def binary_classify(data_train_total, positive, negative, pos_len, neg_len, expend, data_map, data_name,
                    using_kdd99=False, vae_only=False):
    if expend:
        print('expanding-------------------------------------------------------')
        neg_len = pos_len - neg_len
        # vae_predict = gen_with_multi_vae(negative, neg_len, data_name)
        vae_predict = gen_with_vae(negative, neg_len, data_name)

        if using_kdd99:
            smote = MySmote(data_name, target_class=negative, data_map=data_map)
            smote_predict = smote.predict(
                target_len=neg_len, data_map=data_map)

            # SMOTE leaves discrete attributes as the string 'None'; fill
            # those positions from the VAE-generated row at the same index.
            for row in range(len(smote_predict)):
                for i in range(len(smote_predict[row].attr_list)):
                    if smote_predict[row][i] == 'None':
                        smote_predict[row][i] = vae_predict[row][i]
        else:
            smote_predict = vae_predict

        data_predict = [fill_with_eucli_distance(data_train_total, p, data_map)
                        for p in smote_predict]
        data_train_total.extend(data_predict)

    y = [d.data_class for d in data_train_total]
    if using_kdd99:
        positive = get_kdd99_big_classification(positive)
        negative = get_kdd99_big_classification(negative)
        y = [get_kdd99_big_classification(c) for c in y]

    if vae_only:
        # Keep only the continuous attributes when classifying on VAE output.
        data_train_total = [d.discrete_to_num(data_map=data_map).to_list([DataType.CONTINUOUS])
                            for d in data_train_total]
    else:
        data_train_total = [d.discrete_to_num(data_map=data_map).attr_list for d in data_train_total]
    x = np.array(data_train_total).astype(float)

    print({k: y.count(k) for k in set(y)})

    # Todo: kf
    kf = KFold(n_splits=5, shuffle=True)
    args = []

    for i_train, i_test in kf.split(X=x, y=y):
        train_x = [x[i] for i in i_train]
        train_y = [y[i] for i in i_train]
        test_x = [x[i] for i in i_test]
        test_y = [y[i] for i in i_test]
        clf = svm.SVC(kernel='linear', probability=True,
                      random_state=np.random.RandomState(0))
        # clf = svm.SVC()
        # clf = GaussianNB()
        # clf = tree.DecisionTreeClassifier()
        clf.fit(train_x, train_y)

        predict_y = list(clf.predict(test_x))

        temp = compute_classification_indicators(*compute_TP_TN_FP_TN(test_y, predict_y, positive, negative))
        if not args:
            args = temp
        else:
            args = [a + t for a, t in zip(args, temp)]
    print(u'acc+: {:>2.3f}, acc-: {:>2.3f}, accuracy: {:>2.3f}, precision: {:>2.3f}, '
          u'recall: {:>2.3f}, F1: {:>2.3f}, G-mean: {:>2.3f}'
          .format(*[a / 5 for a in args]))
    print('')
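The # Todo: kf marker suggests the fold strategy was still open. With classes as skewed as KDD99's, a plain KFold split can leave a fold without any minority samples; scikit-learn's StratifiedKFold preserves the class proportions in every fold. A minimal sketch on toy data (the labels below are illustrative, not from the project):

from sklearn.model_selection import StratifiedKFold

# Toy data: 10 'normal' rows and 5 'u2r' rows, so each of the 5 folds
# still contains at least one minority sample.
x = [[i / 10] for i in range(15)]
y = ['normal'] * 10 + ['u2r'] * 5

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for i_train, i_test in skf.split(x, y):
    print(sorted(y[i] for i in i_test))  # every fold includes a 'u2r'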
Example #4
import copy

import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold

def multi_classify(data_train_total, positive, negatives, pos_len, negs_len, data_map, data_name, expend=False,
                   using_kdd99=False):
    if expend:
        print('-' * 16 + 'expanding' + '-' * 16)
        for negative, neg_len in zip(negatives, negs_len):
            neg_len = pos_len - neg_len
            vae_predict = gen_with_vae(negative, neg_len, data_name)

            if using_kdd99:
                smote = MySmote(data_name, target_class=negative, data_map=data_map)
                smote_predict = smote.predict(
                    target_len=neg_len, data_map=data_map)

                # SMOTE leaves discrete attributes as the string 'None'; fill
                # those positions from the VAE-generated row at the same index.
                for row in range(len(smote_predict)):
                    for i in range(len(smote_predict[row].attr_list)):
                        if smote_predict[row][i] == 'None':
                            smote_predict[row][i] = vae_predict[row][i]
            else:
                smote_predict = vae_predict

            data_predict = [fill_with_eucli_distance(data_train_total, p, data_map)
                            for p in smote_predict]
            data_train_total.extend(data_predict)

    y = [d.data_class for d in data_train_total]
    if using_kdd99:
        y = [get_kdd99_big_classification(c) for c in y]
        # TODO PRINT
        print({big_class: y.count(big_class) for big_class in set(y)})

    data_train_total = [d.discrete_to_num(data_map=data_map).attr_list for d in data_train_total]

    x = np.array(data_train_total).astype(float)

    kf = KFold(n_splits=5, shuffle=True)

    ones = copy.deepcopy(negatives) + [positive]
    if using_kdd99:
        ones = list(set([get_kdd99_big_classification(c) for c in ones]))
    ones.sort()

    for one in ones:
        negs = [o for o in ones if o != one]
        print('{} vs others'.format(one))

        TP, TN, FP, FN = 0, 0, 0, 0
        for i_train, i_test in kf.split(X=x, y=y):
            train_x = [x[i] for i in i_train]
            train_y = [y[i] for i in i_train]
            test_x = [x[i] for i in i_test]
            test_y = [y[i] for i in i_test]
            # clf = svm.SVC()
            # if expend:
            clf = svm.SVC(kernel='linear')
            clf.fit(train_x, train_y)

            predict_y = list(clf.predict(test_x))

            TP_k, TN_k, FP_k, FN_k = compute_TP_TN_FP_TN(class_test=test_y, class_predict=predict_y, positive=one,
                                                         negative=negs)
            TP += TP_k
            TN += TN_k
            FP += FP_k
            FN += FN_k

        # TP/TN/FP/FN are pooled over the five folds, so the indicator is
        # already a cross-validated estimate and needs no further division.
        acc = compute_classification_indicators(TP, TN, FP, FN)[0]
        # TODO PRINT
        # print('{:>2.3f}'.format(TP / (TP + TN + FP + FN)))
        print('acc : {:>2.3f}'.format(acc))
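The nested 'None'-filling loop recurs in every example: SMOTE interpolates the continuous attributes but leaves discrete ones as the placeholder string 'None', and the VAE row at the same index supplies those slots. Factored out, it might read as below; this is a sketch that assumes the row objects support attr_list and item assignment, as in the snippets above.

def merge_smote_with_vae(smote_rows, vae_rows):
    # Sketch of the inline loop above: wherever SMOTE produced the
    # placeholder string 'None', copy the VAE-generated value instead.
    for row, smote_row in enumerate(smote_rows):
        for i in range(len(smote_row.attr_list)):
            if smote_row[i] == 'None':
                smote_row[i] = vae_rows[row][i]
    return smote_rows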
Example #5
import copy
import math

from sklearn import svm
from sklearn.model_selection import KFold

def multi_classify(
    data_train,
    positive,
    negatives,
    positive_len,
    negative_lens,
    discrete_num_map,
    data_name,
    expend=False,
):
    if expend:
        print('\033[0;36m--expanding' + '-' * 16 + '\033[0m')
        for negative, negative_len in zip(negatives, negative_lens):

            # A big class's shortfall relative to the positive class is split
            # among its subclasses in proportion to their sizes.
            group_total = sum(
                l for n, l in zip(negatives, negative_lens)
                if get_kdd99_big_classification(n) ==
                get_kdd99_big_classification(negative))
            negative_len = math.floor(
                (positive_len - group_total) * (negative_len / group_total))

            vae_predict = gen_with_vae(negative, negative_len, data_name)
            smote_predict = gen_with_smote(negative, negative_len, data_name,
                                           discrete_num_map)

            # SMOTE leaves discrete attributes as the string 'None'; fill
            # those positions from the VAE-generated row at the same index.
            for row in range(len(smote_predict)):
                for i in range(len(smote_predict[row].attr_list)):
                    if smote_predict[row][i] == 'None':
                        smote_predict[row][i] = vae_predict[row][i]

            data_predict = []
            for p in smote_predict:
                res = fill_with_eucli_distance(data_train, p, discrete_num_map)
                data_predict.append(res)
            data_train.extend(data_predict)

    X = [d.discrete_to_num(discrete_num_map).attr_list for d in data_train]
    y = [get_kdd99_big_classification(d.data_class) for d in data_train]

    print({big_class: y.count(big_class) for big_class in set(y)})
    kf = KFold(n_splits=5, shuffle=True)

    big_classes = copy.deepcopy(negatives) + [positive]
    big_classes = list(
        set([get_kdd99_big_classification(c) for c in big_classes]))
    big_classes.sort()

    clf = svm.SVC()
    # clf = GaussianNB()
    # clf = tree.DecisionTreeClassifier()
    # clf = KNeighborsClassifier(n_neighbors=3)
    # clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
    #                          algorithm="SAMME",
    #                          n_estimators=200, learning_rate=0.8)
    print('\nusing : ', clf.__class__.__name__)
    for target_class in big_classes:
        print('\033[0;33m {} vs rest \033[0m'.format(target_class))
        rest_classes = [bc for bc in big_classes if bc != target_class]

        acc = None
        for i_train, i_test in kf.split(X, y):
            train_X = [X[i] for i in i_train]
            train_y = [y[i] for i in i_train]
            test_X = [X[i] for i in i_test]
            test_y = [y[i] for i in i_test]

            clf.fit(train_X, train_y)

            predict_y = list(clf.predict(test_X))

            fold_acc = compute_classification_indicators(
                *compute_TP_TN_FP_TN(test_y, predict_y, target_class,
                                     rest_classes))[0]
            # The 'is None' test avoids misreading a zero-accuracy fold as
            # "not yet initialized".
            acc = fold_acc if acc is None else acc + fold_acc
        print('acc : {:>2.3f}'.format(acc / 5))
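The quota expression at the top of Example #5's loop is easier to follow with numbers: a big class's shortfall relative to the positive class is distributed among its subclasses in proportion to their sizes. The figures below are made up for illustration.

import math

# Toy numbers, not from the project: the positive class has 1000 rows and
# the 'dos' big class has two subclasses with 300 and 100 rows.
positive_len = 1000
group_lens = {'neptune': 300, 'smurf': 100}
group_total = sum(group_lens.values())  # 400
shortfall = positive_len - group_total  # 600
quota = {name: math.floor(shortfall * (n / group_total))
         for name, n in group_lens.items()}
print(quota)  # {'neptune': 450, 'smurf': 150}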