Example #1
0
def main(filename, xtrains_percent = 0.2, maxfeature = None, fit_ylabel = False, nn_estimator = 100, sepaLabel = True,
         treeLabel = False, seed = 42, pcaLabel = False, n_comp = 2, sepa2 = False, time_label = False, stream = False,
         sfl = False, anomaly_rate = None, max_samples = None, sample_nor = None):
    """Run cross-validated IsolationForest anomaly detection on one .mat dataset.

    Loads X/y from `filename` (HDF5 via h5py for http/smtp, scipy.io.loadmat
    otherwise), splits the data into `1/xtrains_percent`-ish folds, fits an
    IsolationForest per fold, and averages AUC/accuracy/FNR/FPR over folds.

    Parameters (as used by the visible code):
        filename         : path to a .mat dataset; must match one of the
                           hard-coded paths below or an Exception is raised.
        xtrains_percent  : fraction of data used as the TEST slice per fold
                           (the remaining folds form the training set).
        maxfeature       : clf.max_features; defaults to the feature count.
        fit_ylabel       : if True, labels are passed to clf.fit (kept False).
        nn_estimator     : number of trees (clf.n_estimators).
        sepaLabel        : True = split anomalies/normals separately ("separated"),
                           False = fold the raw X/y arrays ("mixed").
        treeLabel        : suppresses the parameter printout and switches the
                           return value to the mean FNR.
        seed             : seed for np.random.RandomState given to the forest.
                           NOTE(review): the `random` module calls below
                           (sample/shuffle) are NOT seeded by this.
        pcaLabel, n_comp : optionally PCA-reduce the data to n_comp components.
        sepa2            : unused in this example's body.
        time_label       : if True, return the total wall-clock time instead.
        stream           : if True, predict test points one at a time.
        sfl              : if True, shuffle train/test sets (unseeded).
        anomaly_rate     : target anomaly ratio for the TRAINING folds.
        max_samples      : clf.max_samples override (defaults to 1.0).
        sample_nor       : subsample the normal class to this many points
                           (anomalies scaled to keep clf.contamination).

    Returns:
        all_time (if time_label) | mean FNR (if treeLabel) | (mean AUC, mean acc).
    """
    inf = float("inf")
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # http and smtp are stored as HDF5 and need h5py instead of loadmat.
    # NOTE(review): '\U' in these non-raw Windows path literals is a
    # SyntaxError under Python 3 — they would need r'...' raw strings
    # (left byte-identical here; fixing them changes runtime strings).
    if filename == 'C:\Users\Riku Anegawa\Desktop/Dropbox/http.mat' or filename == 'C:\Users\Riku Anegawa\Desktop/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        y2 = mat['y'][0]
        y3 = []
        for i in range(len(y2)):
            y3.append(int(y2[i]))
        # reshape labels to a column vector, matching loadmat's 'y' shape
        y = np.reshape(y3, [len(y3), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']


    rate = xtrains_percent

    # default max_features to the full feature count
    if maxfeature == None:
        max_feat = len(X[0])
    else:
        max_feat = int(maxfeature)

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat
    if max_samples != None:
        clf.max_samples = max_samples
    else:
        clf.max_samples = 1.

    # print(X.shape)

    # Per-dataset contamination (known anomaly ratios), keyed on exact path.
    if (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/shuttle.mat'):
        clf.contamination = 0.07

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/http.mat'):
        clf.contamination = 0.004

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/pima.mat'):
        clf.contamination = 0.35

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/mammography.mat'):
        clf.contamination = 0.02

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/cover.mat'):
        clf.contamination = 0.009

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/breastw.mat'):
        clf.contamination = 0.35

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/arrhythmia.mat'):
        clf.contamination = 0.15

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/ionosphere.mat'):
        clf.contamination = 0.36

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/satellite.mat'):
        clf.contamination = 0.32

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/annthyroid.mat'):
        clf.contamination = 0.07

    elif (str(filename) == 'C:\Users\Riku Anegawa\Desktop/Dropbox/smtp.mat'):
        clf.contamination = 0.03 / 100

    else:
        raise Exception("error! cannot file it.")

    # Number of cross-validation rounds (e.g. 5 rounds for an 8:2 split).
    # There is probably a cleaner way to do this.
    hoge = 1 / rate
    cross_count = int(np.ceil(hoge))
    if cross_count > hoge:
        cross_count = cross_count - 1


    # running totals of AUC/accuracy/rates over all cross_count folds
    sum_auc_roc = 0
    sum_accuracy = 0
    FPR_sum = 0
    TPR_sum = 0
    TNR_sum = 0
    FNR_sum = 0


    # timing accumulators (averaged over folds at the end)
    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    # This section only partitions the data into cross-validation folds.
    if sepaLabel == True:  # separated
        # data cut: split rows into anomaly (y==1) and normal classes
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])

        # Optionally adjust the contamination of the WHOLE dataset.
        # Basically never used (zentai is hard-coded False).
        # NOTE(review): dead code; if enabled as written it would crash
        # (anomaly_rate_all is None) and the inner comparison
        # `anomaly_rate_all < clf.contamination` is always False right
        # after the assignment on the previous line.
        zentai = False
        anomaly_rate_all = None
        if zentai:
            if anomaly_rate_all != clf.contamination:
                clf.contamination = anomaly_rate_all
                if anomaly_rate_all < clf.contamination:
                    # cut down the anomaly samples
                    k = int(np.ceil(len(X_normal) * (anomaly_rate_all / (1 - anomaly_rate_all))))
                    anomaly_hoge = random.sample(X_anomaly, k)  # random subsample
                    normal_hoge = X_normal
                else:
                    # cut down the normal samples
                    n_normal = int(len(X_anomaly) / anomaly_rate_all) - len(X_anomaly)
                    normal_rate = n_normal / len(X_normal)
                    k = int(np.ceil(len(X_normal) * normal_rate))
                    normal_hoge = random.sample(X_normal, k)  # random subsample
                    anomaly_hoge = X_anomaly
                X_anomaly = anomaly_hoge
                X_normal = normal_hoge

        # Optional subsampling to a fixed number of normal points; anomalies
        # are scaled so the overall ratio matches clf.contamination.
        if sample_nor != None:
            X_normal = random.sample(X_normal, sample_nor)
            cont = clf.contamination
            sample_ano = (cont/(1-cont)) * sample_nor
            X_anomaly = random.sample(X_anomaly, int(sample_ano))
            # print("size : " + str(len(X_normal)+len(X_anomaly)))

        # Slice each class into cross_count contiguous folds of ~rate size.
        cutter_anomaly = len(X_anomaly) * rate
        cutter_normal = len(X_normal) * rate
        X_sepa_ano = []
        X_sepa_nor = []
        for i in range(cross_count):
            head2 = int(cutter_normal * i)
            tail2 = int(cutter_normal * (i+1)) - 1
            X_sepa_nor.append(X_normal[head2:tail2+1])
            # print(len(X_sepa_nor))
            # print(len(X_sepa_nor[0]))
            # print(len(X_sepa_nor[0][0]))

            head = int(cutter_anomaly * i)
            tail = int(cutter_anomaly * (i+1)) - 1
            X_sepa_ano.append(X_anomaly[head:tail+1])
        # print(len(X_sepa_nor))
        # print(len(X_sepa_nor[0]))
        # print(len(X_sepa_ano))
        # print(len(X_sepa_ano[0]))
        # print("")



    else:
        # mixed: fold the raw X/y arrays without separating classes
        X_sepa = []
        y_sepa = []
        cutter = len(X)*rate
        for i in range(cross_count):
            head = int(cutter*i)
            tail = int(cutter*(i+1))-1
            X_sepa.append(X[head:tail+1])
            y_sepa.append(y[head:tail+1])




    # One iteration per fold: fold `count` is the test set, the rest train.
    for count in range(cross_count):
        if sepaLabel:
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []


            # Adjust the anomaly ratio of the TRAINING data if requested.
            for i in range(cross_count):
                if i != count:
                    train_flag = True
                    if anomaly_rate is not None:
                        if clf.contamination != anomaly_rate:
                            train_flag = False
                            # NOTE(review): this nested check duplicates the
                            # condition just above and is always True here.
                            if clf.contamination != anomaly_rate:
                                # NOTE(review): `all` shadows the builtin.
                                all = len(X_sepa_nor[i]) + len(X_sepa_ano[i])
                                cont = len(X_sepa_ano[i]) / all
                                # print(cont)
                                # print(anomaly_rate)
                                if cont > anomaly_rate:# reduce anomalies
                                    k = int(np.ceil(len(X_sepa_nor[i]) * (anomaly_rate / (1 - anomaly_rate))))
                                    if len(X_sepa_ano[i]) < k:
                                        k = len(X_sepa_ano[i])
                                    anomaly_hoge = random.sample(X_sepa_ano[i], k)  # random subsample
                                    normal_hoge = X_sepa_nor[i]
                                else:  # reduce normals
                                    k = int(len(X_sepa_ano[i]) / anomaly_rate) - len(X_sepa_ano[i])
                                    normal_hoge = random.sample(X_sepa_nor[i], k)  # random subsample
                                    anomaly_hoge = X_sepa_ano[i]
                                    # k = int(len(X_sepa_ano[i]) * (1-anomaly_rate)/anomaly_rate)
                                    # normal_rate = n_normal / len(X_sepa_nor[i])
                                    # k = int(np.ceil(len(X_sepa_nor[i]) * normal_rate))
                                    # X_sepa_nor[i] = normal_hoge
                                # labels: -1 = anomaly, +1 = normal (sklearn convention)
                                X_train.extend(anomaly_hoge)
                                # print("aaaaa")
                                # print(X_train)
                                for j in range(len(anomaly_hoge)):
                                    X_train_correct.append(-1)

                                X_train.extend(normal_hoge)
                                for j in range(len(normal_hoge)):
                                    X_train_correct.append(1)

                    if train_flag:
                        # no ratio adjustment: take the fold as-is
                        X_train.extend(X_sepa_ano[i])
                        for j in range(len(X_sepa_ano[i])):
                            X_train_correct.append(-1)

                        X_train.extend(X_sepa_nor[i])
                        for j in range(len(X_sepa_nor[i])):
                            X_train_correct.append(1)

                else:
                    # X_test.extend(X_sepa_ano[i])
                    # for j in range(len(X_sepa_ano[i])):
                    #     X_test_correct.append(-1)
                    # X_test.extend(X_sepa_nor[i])
                    # for j in range(len(X_sepa_nor[i])):
                    #     X_test_correct.append(1)

                    # Optionally also adjust the TEST data ratio
                    # (disabled: anomaly_rate2 is hard-coded None).
                    anomaly_rate2 = None
                    test_flag = True
                    if anomaly_rate2 is not None:
                        clf.contamination = anomaly_rate2
                        # NOTE(review): dead branch — after the assignment
                        # above this comparison can never be True.
                        if clf.contamination != anomaly_rate2:
                            test_flag = False
                            if clf.contamination > anomaly_rate2:  # reduce anomalies
                                k = int(np.ceil(len(X_sepa_nor[i]) * (anomaly_rate / (1 - anomaly_rate))))
                                anomaly_hoge = random.sample(X_sepa_ano[i], k)  # random subsample
                                normal_hoge = X_sepa_nor[i]
                                # X_sepa_ano[i] = anomaly_hoge
                                # X_sepa_ano[i] = []
                                # X_sepa_ano[i].extend(anomaly_hoge)
                            else:  # reduce normals
                                n_normal = int(len(X_sepa_ano[i]) / anomaly_rate) - len(X_sepa_ano[i])
                                normal_rate = n_normal / len(X_sepa_nor[i])
                                k = int(np.ceil(len(X_sepa_nor[i]) * normal_rate))
                                normal_hoge = random.sample(X_sepa_nor[i], k)  # random subsample
                                anomaly_hoge = X_sepa_ano[i]
                                # X_sepa_nor[i] = normal_hoge
                            X_test.extend(anomaly_hoge)
                            for j in range(len(anomaly_hoge)):
                                X_test_correct.append(-1)

                            X_test.extend(normal_hoge)
                            for j in range(len(normal_hoge)):
                                X_test_correct.append(1)

                    if test_flag:
                        X_test.extend(X_sepa_ano[i])
                        for j in range(len(X_sepa_ano[i])):
                            X_test_correct.append(-1)

                        X_test.extend(X_sepa_nor[i])
                        for j in range(len(X_sepa_nor[i])):
                            X_test_correct.append(1)



            # Shuffle train/test (pairs samples with labels, shuffles, unpacks).
            # NOTE(review): random.shuffle is unseeded, so results vary per run.
            if sfl:
                X_train_set = []
                X_test_set = []
                for i in range(len(X_train)):
                    buf = []
                    buf.append(X_train[i])
                    buf.append(X_train_correct[i])
                    X_train_set.append(buf)

                for i in range(len(X_test)):
                    buf = []
                    buf.append(X_test[i])
                    buf.append(X_test_correct[i])
                    X_test_set.append(buf)

                random.shuffle(X_train_set)
                random.shuffle(X_test_set)

                X_train = []
                X_test = []
                X_train_correct = []
                X_test_correct = []
                for i in range(len(X_train_set)):
                    X_train.append(X_train_set[i][0])
                    X_train_correct.append(X_train_set[i][1])
                for i in range(len(X_test_set)):
                    X_test.append(X_test_set[i][0])
                    X_test_correct.append(X_test_set[i][1])





        else: #mixed
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []
            for i in range(cross_count):
                if i != count:
                    # print(X_train)
                    X_train.extend(X_sepa[i])
                    X_train_correct.extend(y_sepa[i])
                else:#i == count
                    # print(i, 1111111)
                    X_test.extend(X_sepa[i])
                    X_test_correct.extend(y_sepa[i])

            # Remap labels: dataset y==1 (anomaly) -> -1, everything else -> +1.
            for q in range(len(X_train_correct)):
                j = X_train_correct[q]
                if (j == 1):
                    X_train_correct[q] = -1
                else:
                    X_train_correct[q] = 1

            for w in range(len(X_test_correct)):
                j = X_test_correct[w]
                if (j == 1):
                    X_test_correct[w] = -1
                else:
                    X_test_correct[w] = 1


        # owari
        # finished cutting data

        # Optional PCA reduction fitted on the training fold only.
        if pcaLabel:
            pca_fit_start = time.time()
            pca = PCA(copy=True, iterated_power='auto', n_components=n_comp, random_state=None,
                      svd_solver='auto', tol=0.0, whiten=False)
            # print("X_train: " + str(X_train))
            pca.fit(X_train)
            pca_fit_finish = time.time()

            pca_transform_train_start = time.time()
            X_train = pca.transform(X_train)
            pca_transform_train_finish = time.time()

            # after PCA the forest sees only n_comp features
            clf.max_features = n_comp
            pca_fit_time += (pca_fit_finish - pca_fit_start)
            pca_transform_train_time += (pca_transform_train_finish - pca_transform_train_start)

        #variance and kurtosis
        # Use this path to select the axes with the largest variance/kurtosis
        # (both flags hard-coded False, so this is normally skipped).
        varLabel = False
        kurtosisLabel = False
        if varLabel or kurtosisLabel:
            var = []
            kurtosis_set = []
            skewness_set = []
            for i in range(len(X_train[0])):
                data = []
                for j in range(len(X_train)):
                    data.append(X_train[j][i])
                data = np.array(data)
                ave = np.average(data)
                std = np.std(data)
                # if math.isnan(((data - ave) ** 3 / (std ** 3))[0]):
                #     if ave != 0 or std != 0:
                #         print("ave : " + str(ave))
                #         print("std : " + str(std))
                if std == 0:
                    # sentinel so constant features rank last
                    kurtosis = -10000
                    # NOTE(review): `skewness` is left unbound on this branch
                    # and would raise NameError at the append below.
                else:
                    # NOTE(review): formulas appear swapped — the 3rd moment
                    # is skewness and the 4th moment minus 3 is (excess)
                    # kurtosis, but they are assigned to the opposite names.
                    kurtosis = np.average((data - ave) ** 3) / (std ** 3)
                    skewness = np.average((data - ave) ** 4) / (std ** 4) - 3

                var.append(std)
                kurtosis_set.append(kurtosis)
                skewness_set.append(skewness)
            var_rank = np.argsort(var)[::-1]
            kurtosis_rank = np.argsort(kurtosis_set)[::-1]
            skewness_rank = np.argsort(skewness_set)[::-1] # no plans to use skewness for now
            # keep only the top clf.max_features columns by the chosen ranking
            hoge = []
            for i in range(clf.max_features):
                if varLabel:
                    hoge.append(np.array(X_train)[:,var_rank[i]])
                elif kurtosisLabel:
                    hoge.append(np.array(X_train)[:,kurtosis_rank[i]])
            X_train = np.array(hoge).T

        fit_start = time.time()
        # fit_ylabel is kept fixed at False
        if fit_ylabel:
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else :
            # print(X_train)
            clf.fit(X_train, y = None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)


        # Prediction: stream = one sample at a time, else whole batch.
        # NOTE(review): `clf.predict` is unpacked into (labels, scores) —
        # stock sklearn returns a single array, so this assumes a modified
        # IsolationForest; confirm against the project's fork.
        if stream:
            sum_score_auc = []
            sum_score_acc = []

            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    a = [X_test[i]]
                    X_test_pca = pca.transform(a)
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)

                else:
                    X_test_pca = [X_test[i]]

                test_start = time.time()
                y_pred_test, a_score = clf.predict(X_test_pca)
                test_finish = time.time()
                test_time += (test_finish - test_start)

                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc

        else: #batch
            if pcaLabel:
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)  # stream version
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)

            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            # a_score = clf.decision_function(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)


        # Fold metrics via project helpers (defined elsewhere in this file).
        # NOTE(review): TPR/TNR are returned but never accumulated into
        # TPR_sum/TNR_sum — only FNR/FPR are summed.
        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC_roc = calc_AUC(X_test_correct, a_score, treeLabel)
        FPR, TPR, TNR, FNR = calc_FN(X_test_correct, y_pred_test)
        FNR_sum += FNR
        FPR_sum += FPR
        sum_auc_roc += AUC_roc
        sum_accuracy += acc



    # (A large block of commented-out matplotlib scatter-plot code was here;
    # it plotted train/test observations on the first/last two feature axes
    # with fixed x/y limits. Removed for readability — restore from VCS if
    # the plots are ever needed again.)




    # Averages over all folds.
    auc2_roc = sum_auc_roc / cross_count
    acc2 = sum_accuracy / cross_count
    fnr = FNR_sum / cross_count
    fpr = FPR_sum / cross_count

    #calc time
    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time
    # print("sum_train_time : " + str(sum_train_time))
    # print("pca_transform_train_time : " + str(pca_transform_train_time))
    # print("pca_fit_time : " + str(pca_fit_time))
    # print("test_time : " + str(test_time))
    # print("fit_time : " + str(fit_time))
    # print("all_time : " + str(all_time))

    # Return shape depends on the mode flags (see docstring).
    if time_label:
        # return all_time, pca_fit_time + pca_transform_train_time, fit_time, pca_transform_test_time, test_time, sum_train_time, sum_test_time
        return all_time
    elif treeLabel:
        # if math.isnan(auc2_roc):
        #     raise Exception("error! auc is NaN!.")
        # return auc2_roc
        return fnr
        # return fpr

    else:
        return auc2_roc, acc2
Example #2
0
def main(filename,
         xtrains_percent=0.8,
         maxfeature=3,
         fit_ylabel=False,
         nn_estimator=100,
         sepaLabel=True,
         treeLabel=False,
         seed=42,
         pcaLabel=False,
         n_comp=2,
         sepa2=False,
         time_label=False,
         stream=False,
         sfl=False):
    inf = float("inf")
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # httpとsmtpのみ別の方法でデータ取得
    if filename == '/home/anegawa/Dropbox/http.mat' or filename == '/home/anegawa/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        y2 = mat['y'][0]
        y3 = []
        for i in range(len(y2)):
            y3.append(int(y2[i]))
        y = np.reshape(y3, [len(y3), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']

    rate = xtrains_percent
    max_feat = int(maxfeature)
    if max_feat == 3:
        max_feat = X.shape[1]

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat

    if (str(filename) == '/home/anegawa/Dropbox/shuttle.mat'):
        clf.contamination = 0.07

    elif (str(filename) == '/home/anegawa/Dropbox/http.mat'):
        clf.contamination = 0.004

    elif (str(filename) == '/home/anegawa/Dropbox/pima.mat'):
        clf.contamination = 0.35

    elif (str(filename) == '/home/anegawa/Dropbox/mammography.mat'):
        clf.contamination = 0.02

    elif (str(filename) == '/home/anegawa/Dropbox/cover.mat'):
        clf.contamination = 0.009

    elif (str(filename) == '/home/anegawa/Dropbox/breastw.mat'):
        clf.contamination = 0.35

    elif (str(filename) == '/home/anegawa/Dropbox/arrhythmia.mat'):
        clf.contamination = 0.15

    elif (str(filename) == '/home/anegawa/Dropbox/ionosphere.mat'):
        clf.contamination = 0.36

    elif (str(filename) == '/home/anegawa/Dropbox/satellite.mat'):
        clf.contamination = 0.32

    elif (str(filename) == '/home/anegawa/Dropbox/annthyroid.mat'):
        clf.contamination = 0.07

    elif (str(filename) == '/home/anegawa/Dropbox/smtp.mat'):
        clf.contamination = 0.03 / 100

    else:
        raise Exception("error! cannot file it.")

    # 交差検証を何回行うか(例:8:2なら5回)
    # もっとうまい方法ありそう
    hoge = 1 / (1 - rate)
    cross_count = int(np.ceil(hoge))
    if cross_count > hoge:
        cross_count = cross_count - 1

    # cross_count分のauc,acc合計
    sum_auc = 0
    sum_accuracy = 0

    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    if sepaLabel == True:  # separated
        # data cut
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])

        cutter_anomaly = int(np.ceil(len(X_anomaly) * rate))
        cutter_normal = int(np.ceil(len(X_normal) * rate))

    for count in range(cross_count):
        if sepaLabel:
            part_anomaly = int(np.ceil(cutter_anomaly * count))
            part_normal = int(np.ceil(cutter_normal * count))
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []

            for i, k in zip(range(len(X_anomaly)),
                            range(part_anomaly,
                                  part_anomaly + len(X_anomaly))):
                while k >= len(X_anomaly):
                    k = k - len(X_anomaly)

                if i < cutter_anomaly:
                    X_train.append(X_anomaly[k])
                    X_train_correct.append(-1)
                else:
                    X_test.append(X_anomaly[k])
                    X_test_correct.append(-1)

            for i, k in zip(range(len(X_normal)),
                            range(part_normal, part_normal + len(X_normal))):
                while k >= len(X_normal):
                    k = k - len(X_normal)

                if i < cutter_normal:
                    X_train.append(X_normal[k])
                    X_train_correct.append(1)
                else:
                    X_test.append(X_normal[k])
                    X_test_correct.append(1)

            # シャッフルするかどうか
            if sfl:
                X_train_set = []
                X_test_set = []
                for i in range(len(X_train)):
                    buf = []
                    buf.append(X_train[i])
                    buf.append(X_train_correct[i])
                    X_train_set.append(buf)
                for i in range(len(X_test)):
                    buf = []
                    buf.append(X_test[i])
                    buf.append(X_test_correct[i])
                    X_test_set.append(buf)

                random.shuffle(X_train_set)
                random.shuffle(X_test_set)

                X_train = []
                X_test = []
                X_train_correct = []
                X_test_correct = []
                for i in range(len(X_train_set)):
                    X_train.append(X_train_set[i][0])
                    X_train_correct.append(X_train_set[i][1])
                for i in range(len(X_test_set)):
                    X_test.append(X_test_set[i][0])
                    X_test_correct.append(X_test_set[i][1])

        else:  # mixed
            cutter = len(X) * rate
            part = int(np.ceil(cutter * count))

            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []

            for i, k in zip(range(len(X)), range(part, part + len(X))):
                while k >= len(X):
                    k = k - len(X)

                if i < len(X) * rate:
                    X_train.append(X[k])
                    X_train_correct.append(y[k])
                else:
                    X_test.append(X[k])
                    X_test_correct.append(y[k])

            for q in range(len(X_train_correct)):
                j = X_train_correct[q]
                if (j == 1):
                    X_train_correct[q] = -1
                else:
                    X_train_correct[q] = 1

            for w in range(len(X_test_correct)):
                j = X_test_correct[w]
                if (j == 1):
                    X_test_correct[w] = -1
                else:
                    X_test_correct[w] = 1

        # owari
        # finished cutting data

        if pcaLabel:
            if sepa2:
                # if False:
                pca2 = PCA(copy=True,
                           iterated_power='auto',
                           random_state=None,
                           svd_solver='auto',
                           tol=0.0,
                           whiten=False)
                pca2.fit(X_train_normal)
                component = pca2.components_
                component2 = np.sort(pca2.components_)
                if n_comp < len(component2):
                    pca2.components_ = component2[0:n_comp]
                X_train = pca2.transform(X_train)
                X_test = pca2.transform(X_test)

            else:
                pca_fit_start = time.time()
                pca = PCA(copy=True,
                          iterated_power='auto',
                          n_components=n_comp,
                          random_state=None,
                          svd_solver='auto',
                          tol=0.0,
                          whiten=False)
                pca.fit(X_train)
                pca_fit_finish = time.time()

                pca_transform_train_start = time.time()
                X_train = pca.transform(X_train)
                pca_transform_train_finish = time.time()

            clf.max_features = n_comp
            pca_fit_time += (pca_fit_finish - pca_fit_start)
            pca_transform_train_time += (pca_transform_train_finish -
                                         pca_transform_train_start)

        fit_start = time.time()
        # fit_ylabelはFalseで固定
        if fit_ylabel:
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)

        if stream:
            sum_score_auc = []
            sum_score_acc = []

            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    a = [X_test[i]]
                    X_test_pca = pca.transform(a)
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish -
                                                pca_transform_test_start)

                else:
                    X_test_pca = [X_test[i]]

                test_start = time.time()
                y_pred_test, a_score = clf.predict(X_test_pca)
                test_finish = time.time()
                test_time += (test_finish - test_start)

                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc

        else:  # batch
            if pcaLabel:
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)  # stream version
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish -
                                            pca_transform_test_start)

            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            # a_score = clf.decision_function(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC = calc_AUC(X_test_correct, a_score, treeLabel)
        sum_auc += AUC
        sum_accuracy += acc

    # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # # plot the line, the samples, and the nearest vectors to the plane
    #
    # X_train = np.array(X_train)
    # X_test = np.array(X_test)
    #
    # lim = True
    # x = (-200, 200)
    # y = (-200, 300)
    #
    # for i,j in zip(range(2), [True, False]):
    #     small = j  # trueがsmallestね
    #
    #     plt.subplot(2, 2, i+1)
    #     if small:
    #         plt.title("smallest")
    #     else:
    #         plt.title("largest")
    #
    #     if small:
    #         # b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, X_test.shape[1]-1], X_test[:, X_test.shape[1]-2], c='green', s=20, edgecolor='k')
    #     else:
    #         # b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b2],["testing observations"],
    #                loc="upper left")
    #     # plt.legend([b1], ["training observations"],
    #     #            loc="upper left")
    #
    #
    #
    #     plt.subplot(2, 2, i+3)
    #     if small:
    #         b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, X_test.shape[1] - 1], X_test[:, X_test.shape[1] - 2], c='green', s=20, edgecolor='k')
    #     else:
    #         b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     # plt.legend([b2], ["testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b1], ["training observations"],
    #                loc="upper left")
    # plt.show()ter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, X_test.shape[1]-1], X_test[:, X_test.shape[1]-2], c='green', s=20, edgecolor='k')
    #     else:
    #         # b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b2],["testing observations"],
    #                loc="upper left")
    #     # plt.legend([b1], ["training observations"],
    #     #            loc="upper left")
    #
    #
    #
    #     plt.subplot(2, 2, i+3)
    #     if small:
    #         b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, X_test.shape[1] - 1], X_test[:, X_test.shape[1] - 2], c='green', s=20, edgecolor='k')
    #     else:
    #         b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     # plt.legend([b2], ["testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b1], ["training observations"],
    #                loc="upper left")
    # plt.show()
    # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    auc2 = sum_auc / cross_count
    acc2 = sum_accuracy / cross_count

    # calc time
    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time
    # print("sum_train_time : " + str(sum_train_time))
    # print("pca_transform_train_time : " + str(pca_transform_train_time))
    # print("pca_fit_time : " + str(pca_fit_time))
    # print("test_time : " + str(test_time))
    # print("fit_time : " + str(fit_time))
    # print("all_time : " + str(all_time))

    if time_label:
        return all_time, pca_fit_time + pca_transform_train_time, fit_time, pca_transform_test_time, test_time, sum_train_time, sum_test_time
    elif treeLabel:
        if math.isnan(auc2):
            raise Exception("error! auc is NaN!.")
        return auc2
    else:
        return auc2, acc2
Example #3
0
# Per-dataset contamination rates (fraction of outliers), hand-tuned per file.
_CONTAMINATION_BY_FILE = {
    '/home/anegawa/Dropbox/shuttle.mat': 0.07,
    '/home/anegawa/Dropbox/http.mat': 0.004,
    '/home/anegawa/Dropbox/pima.mat': 0.35,
    '/home/anegawa/Dropbox/mammography.mat': 0.02,
    '/home/anegawa/Dropbox/cover.mat': 0.009,
    '/home/anegawa/Dropbox/breastw.mat': 0.35,
    '/home/anegawa/Dropbox/arrhythmia.mat': 0.15,
    '/home/anegawa/Dropbox/ionosphere.mat': 0.36,
    '/home/anegawa/Dropbox/satellite.mat': 0.32,
    '/home/anegawa/Dropbox/annthyroid.mat': 0.07,
    '/home/anegawa/Dropbox/smtp.mat': 0.03 / 100,
}


def _load_dataset(filename):
    """Load samples X and labels y from a .mat file.

    http.mat and smtp.mat are HDF5-format files and must be read with h5py
    (transposing 'X' and flattening 'y'); every other file is a classic
    MATLAB file readable by scipy.io.loadmat.
    """
    if filename in ('/home/anegawa/Dropbox/http.mat',
                    '/home/anegawa/Dropbox/smtp.mat'):
        mat = {}
        # Context manager so the HDF5 handle is closed (original leaked it).
        with h5py.File(filename, 'r') as f:
            for k, v in f.items():
                mat[k] = np.array(v)
        X = mat['X'].T
        labels = [int(v) for v in mat['y'][0]]
        y = np.reshape(labels, [len(labels), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']
    return X, y


def main(filename,
         xtrains_percent=0.8,
         maxfeature=1,
         fit_ylabel=False,
         nn_estimator=100,
         sepaLabel=True,
         treeLabel=False,
         seed=42,
         pcaLabel=False,
         n_comp=2,
         sepa2=False,
         time_label=False,
         stream=False,
         sfl=False):
    """Run an IsolationForest anomaly-detection experiment on a .mat dataset.

    Loads ``filename`` (expects keys 'X' and 'y'), builds rotated
    train/test splits, optionally applies PCA, fits an IsolationForest
    and evaluates accuracy and AUC via ``calc_accuracy`` / ``calc_AUC``.

    Parameters
    ----------
    filename : str
        Path to the .mat data file; also selects a per-dataset
        contamination rate from ``_CONTAMINATION_BY_FILE``.
    xtrains_percent : float
        Fraction of data used for training; also determines the number of
        split rotations (e.g. 0.8 -> 5).
    maxfeature : int
        max_features for the forest; the magic value 3 means "all columns".
    fit_ylabel : bool
        If True, pass training labels to ``clf.fit`` (kept False in practice).
    nn_estimator : int
        Number of trees in the forest.
    sepaLabel : bool
        True: split anomalies and normals separately so both classes appear
        in every fold; False: rotate a window over the mixed data.
    treeLabel : bool
        Quiet mode; also switches the return value to AUC only.
    seed : int
        Seed for numpy's RandomState.
    pcaLabel : bool
        If True, project the data to ``n_comp`` components with PCA.
    n_comp : int
        Number of PCA components.
    sepa2 : bool
        Alternative PCA variant (NOTE(review): references an undefined
        ``X_train_normal`` and will raise NameError as written — confirm
        the intended preprocessing before enabling).
    time_label : bool
        If True, return timing measurements instead of scores.
    stream : bool
        If True, transform/predict the test set one sample at a time.
    sfl : bool
        If True, shuffle the separated train/test sets (keeping sample and
        label paired).

    Returns
    -------
    tuple or float
        ``time_label`` -> 7-tuple of timings; ``treeLabel`` -> AUC only;
        otherwise ``(auc, accuracy)``.

    Raises
    ------
    Exception
        If ``treeLabel`` is set and the averaged AUC is NaN.
    """
    all_start = time.time()
    rng = np.random.RandomState(seed)

    X, y = _load_dataset(filename)

    rate = xtrains_percent
    max_feat = int(maxfeature)
    if max_feat == 3:  # magic value: 3 means "use every feature"
        max_feat = X.shape[1]

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat

    if str(filename) in _CONTAMINATION_BY_FILE:
        clf.contamination = _CONTAMINATION_BY_FILE[str(filename)]
    else:
        print('cannot file it.')
        # Fallback: synthesize two gaussian clouds plus exponential outliers.
        # NOTE(review): the arrays built here are overwritten by the split
        # code below, so this branch is effectively a debugging stub.
        a = rng.randn(400, 2)
        X = 0.3 * a
        X_train = np.r_[X + 2, X - 2]

        # Regular novel observations.
        X = 0.3 * rng.randn(400, 2)
        X_test = np.r_[X + 2, X - 2]

        # Abnormal novel observations.
        X_outliers = np.random.exponential(1. / 0.001, size=[20, 2])
        X_test = np.r_[X_test, X_outliers]
        # BUG FIX: was np.ones([X_train.shape]) — wrapping the shape tuple
        # in a list raises TypeError.
        X_train_correct = np.ones(X_train.shape)

    # Number of split rotations, e.g. rate 0.8 -> 5; the ceil/decrement
    # dance compensates for floating-point error in 1 / (1 - rate).
    folds = 1 / (1 - rate)
    cross_count = int(np.ceil(folds))
    if cross_count > folds:
        cross_count -= 1

    sum_auc = 0
    sum_accuracy = 0

    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    if sepaLabel:  # split anomalies and normals separately
        # NOTE(review): on this path the fit/predict/score section below is
        # never reached (it sits inside the `else` branch), so the function
        # returns zeros — likely a long-standing indentation bug; preserved
        # as-is pending confirmation against the intended experiment.
        X_anomaly = [X[i] for i in range(len(X)) if y[i] == 1]
        X_normal = [X[i] for i in range(len(X)) if y[i] != 1]

        cutter_anomaly = int(np.ceil(len(X_anomaly) * rate))
        cutter_normal = int(np.ceil(len(X_normal) * rate))

        for count in range(cross_count):
            part_anomaly = int(np.ceil(cutter_anomaly * count))
            part_normal = int(np.ceil(cutter_normal * count))
            X_train, X_train_correct = [], []
            X_test, X_test_correct = [], []

            # Rotate a window over the anomalies; -1 marks an anomaly.
            n_anomaly = len(X_anomaly)
            for i in range(n_anomaly):
                k = (part_anomaly + i) % n_anomaly
                if i < cutter_anomaly:
                    X_train.append(X_anomaly[k])
                    X_train_correct.append(-1)
                else:
                    X_test.append(X_anomaly[k])
                    X_test_correct.append(-1)

            # Rotate a window over the normals; +1 marks a normal sample.
            n_normal = len(X_normal)
            for i in range(n_normal):
                k = (part_normal + i) % n_normal
                if i < cutter_normal:
                    X_train.append(X_normal[k])
                    X_train_correct.append(1)
                else:
                    X_test.append(X_normal[k])
                    X_test_correct.append(1)

            if sfl:
                # Shuffle samples and labels together by pairing them first.
                train_pairs = list(zip(X_train, X_train_correct))
                test_pairs = list(zip(X_test, X_test_correct))
                random.shuffle(train_pairs)
                random.shuffle(test_pairs)
                X_train = [p[0] for p in train_pairs]
                X_train_correct = [p[1] for p in train_pairs]
                X_test = [p[0] for p in test_pairs]
                X_test_correct = [p[1] for p in test_pairs]

    else:  # mixed split: rotate a train window over the whole dataset
        cutter = len(X) * rate  # first test index on the first rotation
        for count in range(cross_count):
            part = int(np.ceil(cutter * count))
            X_train, X_train_correct = [], []
            X_test, X_test_correct = [], []

            n = len(X)
            for i in range(n):
                k = (part + i) % n
                if i < cutter:
                    X_train.append(X[k])
                    X_train_correct.append(y[k])
                else:
                    X_test.append(X[k])
                    X_test_correct.append(y[k])

            # Remap dataset labels to the forest's convention:
            # 1 (anomaly) -> -1, everything else -> +1.
            for q in range(len(X_train_correct)):
                X_train_correct[q] = -1 if X_train_correct[q] == 1 else 1
            for w in range(len(X_test_correct)):
                X_test_correct[w] = -1 if X_test_correct[w] == 1 else 1

        # Finished cutting data.
        # NOTE(review): everything below runs once, after the rotation loop,
        # yet the scores are divided by cross_count at the end — preserved
        # as-is; confirm whether fit/eval was meant to run per fold.

        if pcaLabel:
            pca_fit_start = time.time()
            pca = PCA(copy=True,
                      iterated_power='auto',
                      n_components=n_comp,
                      random_state=None,
                      svd_solver='auto',
                      tol=0.0,
                      whiten=False)
            pca2 = PCA(copy=True,
                       iterated_power='auto',
                       random_state=None,
                       svd_solver='auto',
                       tol=0.0,
                       whiten=False)

            if sepa2:
                print("こっち入ってるけどええんか!?")
                # NOTE(review): X_train_normal is not defined anywhere in
                # this function — this path raises NameError as written.
                pca2.fit(X_train_normal)
                # BUG FIX: the finish/start timestamps below were only set
                # on the other branch, causing NameError at the += lines.
                pca_fit_finish = time.time()
                component2 = np.sort(pca2.components_)
                if n_comp < len(component2):
                    pca2.components_ = component2[0:n_comp]
                pca_transform_train_start = time.time()
                X_train = pca2.transform(X_train)
                pca_transform_train_finish = time.time()
                X_test = pca2.transform(X_test)

            else:
                pca.fit(X_train)
                pca_fit_finish = time.time()

                pca_transform_train_start = time.time()
                X_train = pca.transform(X_train)
                pca_transform_train_finish = time.time()

            clf.max_features = n_comp
            pca_fit_time += pca_fit_finish - pca_fit_start
            pca_transform_train_time += (pca_transform_train_finish -
                                         pca_transform_train_start)

        fit_start = time.time()
        # fit_ylabel is kept False in practice.
        if fit_ylabel:
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += fit_finish - fit_start

        if stream:
            # Simulated streaming: transform and predict one sample at a time.
            stream_scores = []
            stream_preds = []

            for i in range(len(X_test)):
                if pcaLabel:
                    t0 = time.time()
                    sample = pca.transform([X_test[i]])
                    pca_transform_test_time += time.time() - t0
                else:
                    sample = [X_test[i]]

                t0 = time.time()
                y_pred_test, a_score = clf.predict(sample)
                test_time += time.time() - t0

                stream_scores.append(a_score)
                stream_preds.append(y_pred_test)
            a_score = stream_scores
            y_pred_test = stream_preds

        else:  # batch prediction
            if pcaLabel:
                t0 = time.time()
                X_test = pca.transform(X_test)
                pca_transform_test_time += time.time() - t0

            t0 = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            test_time += time.time() - t0

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC = calc_AUC(X_test_correct, a_score, treeLabel)
        sum_auc += AUC
        # BUG FIX: was `sum_accuracy = acc` (plain assignment), which threw
        # away earlier folds before the /cross_count average below.
        sum_accuracy += acc

    # Average scores and timings over the rotations.
    auc2 = sum_auc / cross_count
    acc2 = sum_accuracy / cross_count

    all_time = time.time() - all_start
    pca_fit_time /= cross_count
    pca_transform_train_time /= cross_count
    pca_transform_test_time /= cross_count
    test_time /= cross_count
    fit_time /= cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time

    if time_label:
        return (all_time, pca_fit_time + pca_transform_train_time, fit_time,
                pca_transform_test_time, test_time, sum_train_time,
                sum_test_time)
    elif treeLabel:
        if math.isnan(auc2):
            # BUG FIX: previously set an unused flag and silently returned
            # NaN; the sibling variant of this function raises here.
            raise Exception("error! auc is NaN!.")
        return auc2
    else:
        return auc2, acc2