Example #1
def gen_with_smote(target_class, target_num, data_name, discrete_num_map):
    # Generate `target_num` synthetic samples of `target_class` via SMOTE,
    # converting discrete attributes through `discrete_num_map`.
    smote = MySmote(data_name,
                    target_class=target_class,
                    data_map=discrete_num_map)
    smote_predict = smote.predict(target_len=target_num,
                                  data_map=discrete_num_map)

    return smote_predict
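
A minimal invocation sketch for gen_with_smote follows; the class label, sample count, dataset name, and attribute mapping are hypothetical placeholders, not values taken from the source project.

# Hypothetical usage: 'dos', 500, 'kdd99' and the mapping are placeholders only.
discrete_num_map = {'protocol_type': {'tcp': 0, 'udp': 1, 'icmp': 2}}
synthetic = gen_with_smote(target_class='dos',
                           target_num=500,
                           data_name='kdd99',
                           discrete_num_map=discrete_num_map)
print(len(synthetic))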
Example #2
def multi_classify(data_train_total, positive, negatives, pos_len, negs_len, data_map, data_name, expend=False,
                   using_kdd99=False):
    if expend:
        print('-' * 16 + 'expanding' + '-' * 16)
        for negative, neg_len in zip(negatives, negs_len):
            neg_len = pos_len - neg_len
            vae_predict = gen_with_vae(negative, neg_len, data_name)

            if using_kdd99:
                smote = MySmote(data_name, target_class=negative, data_map=data_map)
                smote_predict = smote.predict(
                    target_len=neg_len, data_map=data_map)

                for l in range(len(smote_predict)):
                    for i in range(len(smote_predict[l].attr_list)):
                        if smote_predict[l][i] == 'None':
                            smote_predict[l][i] = vae_predict[l][i]
            else:
                smote_predict = vae_predict

            data_predict = []
            for p in smote_predict:
                res = fill_with_eucli_distance(data_train_total, p, data_map)
                data_predict.append(res)
            data_train_total.extend(data_predict)

    y = [d.data_class for d in data_train_total]
    if using_kdd99:
        y = [get_kdd99_big_classification(c) for c in y]
        # TODO PRINT
        print({big_class: y.count(big_class) for big_class in set(y)})

    data_train_total = [d.discrete_to_num(data_map=data_map).attr_list for d in data_train_total]

    x = np.array(data_train_total).astype(float)

    kf = KFold(n_splits=5, shuffle=True)

    ones = copy.deepcopy(negatives) + [positive]
    if using_kdd99:
        ones = list(set([get_kdd99_big_classification(c) for c in ones]))
    ones.sort()

    for one in ones:
        negs = [o for o in ones if o != one]
        print('{} vs others'.format(one))

        TP, TN, FP, FN = 0, 0, 0, 0
        for i_train, i_test in kf.split(X=x, y=y):
            train_x = [x[i] for i in i_train]
            train_y = [y[i] for i in i_train]
            test_x = [x[i] for i in i_test]
            test_y = [y[i] for i in i_test]
            # clf = svm.SVC()
            # if expend:
            clf = svm.SVC(kernel='linear')
            clf.fit(train_x, train_y)

            predict_y = list(clf.predict(test_x))

            TP_k, TN_k, FP_k, FN_k = compute_TP_TN_FP_TN(class_test=test_y, class_predict=predict_y, positive=one,
                                                         negative=negs)
            TP += TP_k
            TN += TN_k
            FP += FP_k
            FN += FN_k

        # Accuracy computed from the confusion counts pooled over the five folds.
        acc = compute_classification_indicators(TP, TN, FP, FN)[0]
        # TODO PRINT
        # print('{:>2.3f}'.format(TP / (TP + TN + FP + FN)))
        print('acc : {:>2.3f}'.format(acc))
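
Example #2 builds its one-vs-rest evaluation around KFold, a linear SVC, and confusion counts pooled over the folds. Below is a self-contained sketch of that loop, assuming scikit-learn's iris data as a stand-in dataset and replacing the project helpers compute_TP_TN_FP_TN / compute_classification_indicators with inline NumPy counting.

# Sketch of the one-vs-rest cross-validation loop above.
# The iris dataset and the inline confusion counting are stand-ins,
# not part of the original project.
import numpy as np
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold

X, y = load_iris(return_X_y=True)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for one in np.unique(y):
    TP = TN = FP = FN = 0
    for i_train, i_test in kf.split(X):
        clf = svm.SVC(kernel='linear')
        clf.fit(X[i_train], y[i_train])
        pred = clf.predict(X[i_test])
        true = y[i_test]
        # Pool confusion counts over the folds, treating `one` as positive.
        TP += np.sum((pred == one) & (true == one))
        TN += np.sum((pred != one) & (true != one))
        FP += np.sum((pred == one) & (true != one))
        FN += np.sum((pred != one) & (true == one))
    print('{} vs others, acc : {:>2.3f}'.format(one, (TP + TN) / (TP + TN + FP + FN)))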
Example #3
def binary_classify(data_train_total, positive, negative, pos_len, neg_len, expend, data_map, data_name,
                    using_kdd99=False, vae_only=False):
    if expend:
        print('-' * 16 + 'expanding' + '-' * 16)
        neg_len = pos_len - neg_len
        # vae_predict = gen_with_multi_vae(negative, neg_len, data_name)
        vae_predict = gen_with_vae(negative, neg_len, data_name)

        if using_kdd99:
            smote = MySmote(data_name, target_class=negative, data_map=data_map)
            smote_predict = smote.predict(
                target_len=neg_len, data_map=data_map)

            for l in range(len(smote_predict)):
                for i in range(len(smote_predict[l].attr_list)):
                    if smote_predict[l][i] == 'None':
                        smote_predict[l][i] = vae_predict[l][i]
        else:
            smote_predict = vae_predict

        data_predict = []
        for p in smote_predict:
            res = fill_with_eucli_distance(data_train_total, p, data_map)
            data_predict.append(res)
        data_train_total.extend(data_predict)

    y = [d.data_class for d in data_train_total]
    if using_kdd99:
        positive = get_kdd99_big_classification(positive)
        negative = get_kdd99_big_classification(negative)
        y = [get_kdd99_big_classification(c) for c in y]

    if vae_only:
        # Keep only the continuous attributes when classifying VAE output alone.
        data_train_total = [d.discrete_to_num(data_map=data_map).to_list([DataType.CONTINUOUS])
                            for d in data_train_total]
    else:
        data_train_total = [d.discrete_to_num(data_map=data_map).attr_list for d in data_train_total]
    # Todo:
    x = np.array(data_train_total).astype(float)

    print({k: y.count(k) for k in set(y)})

    # Todo: kf
    kf = KFold(n_splits=5, shuffle=True)
    args = []

    for i_train, i_test in kf.split(X=x, y=y):
        train_x = [x[i] for i in i_train]
        train_y = [y[i] for i in i_train]
        test_x = [x[i] for i in i_test]
        test_y = [y[i] for i in i_test]
        clf = svm.SVC(kernel='linear', probability=True,
                      random_state=np.random.RandomState(0))
        # clf = svm.SVC()
        # clf = GaussianNB()
        # clf = tree.DecisionTreeClassifier()
        clf.fit(train_x, train_y)

        predict_y = list(clf.predict(test_x))

        temp = compute_classification_indicators(*compute_TP_TN_FP_TN(test_y, predict_y, positive, negative))
        if not args:
            args = temp
        else:
            args = [a + t for a, t in zip(args, temp)]
    # print(
    #     u'acc\u208A: {:>2.3f}, acc\u208B: {:>2.3f}, accuracy: {:>2.3f}, precision: {:>2.3f}, recall: {:>2.3f}, F1: {:>2.3f}, G-mean: {:>2.3f}'
    #         .format(*[a / 5 for a in args]))
    print(
        u'acc+: {:>2.3f}, acc-: {:>2.3f}, accuracy: {:>2.3f}, precision: {:>2.3f}, recall: {:>2.3f}, F1: {:>2.3f}, G-mean: {:>2.3f}'
            .format(*[a / 5 for a in args]))
    print('')
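
The project's compute_classification_indicators is not shown in these examples, so the exact formulas behind the printed metrics are not visible here. A plausible stand-in, assuming the seven values are the usual definitions of acc+, acc-, accuracy, precision, recall, F1 and G-mean derived from the TP/TN/FP/FN counts:

# Hypothetical stand-in for compute_classification_indicators; the formulas are
# assumptions based on the metric names printed above, not the project's code.
import math

def classification_indicators(TP, TN, FP, FN):
    total = TP + TN + FP + FN
    acc_pos = TP / (TP + FN) if TP + FN else 0.0    # acc+ (sensitivity)
    acc_neg = TN / (TN + FP) if TN + FP else 0.0    # acc- (specificity)
    accuracy = (TP + TN) / total if total else 0.0
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = acc_pos
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    g_mean = math.sqrt(acc_pos * acc_neg)
    return acc_pos, acc_neg, accuracy, precision, recall, f1, g_mean

# Example call with made-up pooled counts from a 5-fold run.
print(['{:>2.3f}'.format(v) for v in classification_indicators(40, 45, 5, 10)])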