Example #1
    def test_normalization(self):
        norm_X_train, norm_X_test = standardizer(self.X_train, self.X_train)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        assert_allclose(norm_X_test.mean(), 0, atol=0.05)
        assert_allclose(norm_X_test.std(), 1, atol=0.05)

        # test when X_t is not provided
        norm_X_train = standardizer(self.X_train)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)
Example #2
    def test_normalization(self):

        # test when X_t is provided and the scalar is not kept
        norm_X_train, norm_X_test = standardizer(self.X_train, self.X_test)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        assert_allclose(norm_X_test.mean(), 0, atol=0.05)
        assert_allclose(norm_X_test.std(), 1, atol=0.05)

        # test when X_t is not provided and the scalar is not kept
        norm_X_train = standardizer(self.X_train)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        # test when X_t is provided and the scalar is kept
        norm_X_train, norm_X_test, scalar = standardizer(self.X_train,
                                                         self.X_test,
                                                         keep_scalar=True)

        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        assert_allclose(norm_X_test.mean(), 0, atol=0.05)
        assert_allclose(norm_X_test.std(), 1, atol=0.05)

        if not hasattr(scalar, 'fit') or not hasattr(scalar, 'transform'):
            raise AttributeError("%s is not a detector instance." % (scalar))

        # test when X_t is not provided and the scalar is kept
        norm_X_train, scalar = standardizer(self.X_train, keep_scalar=True)

        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        if not hasattr(scalar, 'fit') or not hasattr(scalar, 'transform'):
            raise AttributeError("%s is not a detector instance." % (scalar))

        # test shape difference
        with assert_raises(ValueError):
            standardizer(self.X_train, self.X_test_diff)
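
The tests above exercise a standardizer utility that takes an optional second array and a keep_scalar flag. As a point of reference only, here is a minimal sketch of what such a helper might look like, assuming it simply wraps scikit-learn's StandardScaler: fit on the training data, reuse the same statistics for the second array, and optionally return the fitted scaler. The real library function may differ in details.

import numpy as np
from sklearn.preprocessing import StandardScaler


def standardizer(X, X_t=None, keep_scalar=False):
    # Sketch only: zero-mean, unit-variance scaling fit on X.
    # X_t, if given, is transformed with the statistics learned from X,
    # which is why the tolerances on the test-set mean/std above are loose.
    X = np.asarray(X)
    scalar = StandardScaler().fit(X)  # name kept to match keep_scalar above
    X_norm = scalar.transform(X)

    if X_t is None:
        return (X_norm, scalar) if keep_scalar else X_norm

    X_t = np.asarray(X_t)
    if X_t.shape[1] != X.shape[1]:
        raise ValueError("X and X_t must have the same number of features.")
    X_t_norm = scalar.transform(X_t)
    return (X_norm, X_t_norm, scalar) if keep_scalar else (X_norm, X_t_norm)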
Example #3
    for t in range(n_ite):
        print('\nn_ite', t + 1, data)  # print status

        random_state = np.random.RandomState(t)

        # split the data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(
            X_orig, y_orig, test_size=test_size, random_state=random_state)
        # in case of small datasets
        if k_max > X_train.shape[0]:
            k_max = X_train.shape[0]
        k_list = random_state.randint(k_min, k_max, size=n_clf).tolist()

        # normalize the data
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        train_scores = np.zeros([X_train.shape[0], n_clf])
        test_scores = np.zeros([X_test.shape[0], n_clf])

        # initialize the lists to store the results
        test_target_list = []
        method_list = []

        # generate a pool of detectors and predict on test instances
        train_scores, test_scores = train_predict_lof(k_list, X_train_norm,
                                                      X_test_norm,
                                                      train_scores,
                                                      test_scores)

        #######################################################################
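
Both this fragment and the fuller routine in the next example delegate detector construction to a train_predict_lof helper that is not shown. A minimal sketch of what it plausibly does, assuming PyOD's LOF detector: fit one LOF model per neighborhood size in k_list on the normalized training data and fill the pre-allocated score matrices column by column. The actual helper may differ.

from pyod.models.lof import LOF


def train_predict_lof(k_list, X_train_norm, X_test_norm,
                      train_scores, test_scores):
    # Sketch of the assumed helper: one LOF detector per k in k_list.
    for i, k in enumerate(k_list):
        clf = LOF(n_neighbors=k)
        clf.fit(X_train_norm)
        # raw outlier scores of the training samples
        train_scores[:, i] = clf.decision_scores_
        # outlier scores for the held-out test samples
        test_scores[:, i] = clf.decision_function(X_test_norm)
    return train_scores, test_scores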
Example #4
def train_and_predict(data_name):
    # reference pearson size:
    # https://www.researchgate.net/post/What_is_the_minimum_sample_size_to_run_Pearsons_R
    loc_region_size = 0
    loc_region_min = 30  # min local region size
    loc_region_max = 100  # max local region size
    ###############################################################################
    # adjustable parameters
    loc_region_perc = 0.1
    loc_region_ite = 20  # the number of iterations in defining local region
    loc_region_threshold = int(loc_region_ite / 2)  # threshold to keep a point
    loc_min_features = 0.5  # the lower bound of the number of features to use

    n_bins = 10
    n_selected = 1  # actually not a parameter to tweak

    n_clf = 50
    k_min = 5
    k_max = 200

    # for SG_AOM and SG_MOA, choose the right number of buckets
    n_buckets = 5
    n_clf_bucket = int(n_clf / n_buckets)
    assert n_clf % n_buckets == 0  # n_clf must divide evenly into the buckets

    # flag for printing and output saving
    verbose = True

    # record of feature bagging detector
    fb_n_neighbors = []
    ###############################################################################

    start_time = time.time()
    #data_name = 'pageblocks'
    xy_data = pd.read_csv(data_name)
    X_orig, y_orig = xy_data.iloc[:, :-1], xy_data.iloc[:, -1]

    # initialize the matrix for storing scores
    roc_mat = np.zeros([n_ite, n_baselines])  # ROC-AUC scores
    ap_mat = np.zeros([n_ite, n_baselines])  # average precision

    for t in range(n_ite):
        print('\nn_ite', t + 1, data_name)  # print status

        random_state = np.random.RandomState()

        # split the data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(
            X_orig, y_orig, test_size=test_size, random_state=random_state)
        # in case of small datasets
        if k_max > X_train.shape[0]:
            k_max = X_train.shape[0]
        k_list = random_state.randint(k_min, k_max, size=n_clf).tolist()
        k_list.sort()

        # normalize the data
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        train_scores = np.zeros([X_train.shape[0], n_clf])
        test_scores = np.zeros([X_test.shape[0], n_clf])

        # initialize the lists to store the results
        test_target_list = []
        method_list = []

        # generate a pool of detectors and predict on test instances
        train_scores, test_scores = train_predict_lof(k_list, X_train_norm,
                                                      X_test_norm,
                                                      train_scores,
                                                      test_scores)

        #######################################################################
        # fit feature bagging using median of k_list
        # n_neighbors = int(np.median(k_list))
        n_neighbors = random_state.randint(low=k_min, high=k_max)
        clf = FeatureBagging(base_estimator=LOF(n_neighbors=n_neighbors),
                             n_estimators=len(k_list),
                             check_estimator=False)
        print(clf)
        fb_n_neighbors.append(n_neighbors)
        clf.fit(X_train_norm)

        #######################################################################
        # generate normalized scores
        train_scores_norm, test_scores_norm = standardizer(
            train_scores, test_scores)
        # generate mean and max outputs
        # SG_A and SG_M
        target_test_mean = np.mean(test_scores_norm, axis=1)
        target_test_max = np.max(test_scores_norm, axis=1)
        test_target_list.extend([target_test_mean, target_test_max])
        method_list.extend(['GG_a', 'GG_m'])

        # calculate each detector's information entropy
        info_ent_arry = calc_avg_nmi(train_scores)
        # generate the information-entropy-weighted mean
        # (each detector's scores are weighted by its information entropy)
        target_test_entropy_weighted_pear = np.mean(
            test_scores_norm * info_ent_arry.reshape(1, -1), axis=1)

        test_target_list.append(target_test_entropy_weighted_pear)
        method_list.append('GG_a_ent')

        # generate the information-entropy-weighted max
        target_test_weighted_pear_max_ent = np.max(
            test_scores_norm * info_ent_arry.reshape(1, -1), axis=1)

        test_target_list.append(target_test_weighted_pear_max_ent)
        method_list.append('GG_m_ent')

        # generate pseudo target for training -> for calculating weights
        target_mean = np.mean(train_scores_norm, axis=1).reshape(-1, 1)
        target_max = np.max(train_scores_norm, axis=1).reshape(-1, 1)

        # pseudo targets weighted by information entropy
        target_ent_mean = np.mean(train_scores_norm *
                                  info_ent_arry.reshape(1, -1),
                                  axis=1).reshape(-1, 1)
        target_ent_max = np.max(train_scores_norm *
                                info_ent_arry.reshape(1, -1),
                                axis=1).reshape(-1, 1)

        # generate average of maximum (SG_AOM) and maximum of average (SG_MOA)
        target_test_aom = aom(test_scores_norm, n_buckets, n_clf)

        target_test_aom_ent = aom(
            test_scores_norm * info_ent_arry.reshape(1, -1), n_buckets, n_clf)

        test_target_list.extend([target_test_aom, target_test_aom_ent])
        method_list.extend(['GG_aom', 'GG_aom_ent'])
        ##################################################################

        # define the local region size
        loc_region_size = int(X_train_norm.shape[0] * loc_region_perc)
        if loc_region_size < loc_region_min:
            loc_region_size = loc_region_min
        if loc_region_size > loc_region_max:
            loc_region_size = loc_region_max

        # define local region
        ind_arr = get_local_region(X_train_norm,
                                   X_test_norm,
                                   loc_region_size,
                                   loc_region_ite=loc_region_ite,
                                   local_region_strength=loc_region_threshold,
                                   loc_min_features=loc_min_features,
                                   random_state=random_state)

        pred_scores_best = np.zeros([X_test.shape[0], ])
        pred_scores_ens = np.zeros([X_test.shape[0], ])
        pred_scores_best_ent = np.zeros([X_test.shape[0], ])
        pred_scores_ens_ent = np.zeros([X_test.shape[0], ])

        for i in range(X_test.shape[0]):  # iterate over all test instances

            ind_k = ind_arr[i]

            # get the pseudo target: mean
            target_k = target_mean[ind_k, ].ravel()

            target_ent_k = target_ent_mean[ind_k, ].ravel()

            # get the current scores from all clf
            curr_train_k = train_scores_norm[ind_k, :]

            # initialize containers for correlation
            corr_pear_n = np.zeros([n_clf, ])
            corr_pear_n_ent = np.zeros([n_clf, ])

            for d in range(n_clf):
                # keep only the correlation coefficient returned by pearsonr
                corr_pear_n[d, ] = pearsonr(target_k, curr_train_k[:, d])[0]
                corr_pear_n_ent[d, ] = pearsonr(
                    target_ent_k, curr_train_k[:, d])[0]  # * info_ent_arry[d]
                # corr_pear_n_ent[d,] = calc_nmi(target_k, curr_train_k[:, d])
            # pick the best one
            best_clf_ind = np.nanargmax(corr_pear_n)
            pred_scores_best[i, ] = test_scores_norm[i, best_clf_ind]

            best_clf_ind_ent = np.nanargmax(corr_pear_n_ent)
            pred_scores_best_ent[i, ] = test_scores_norm[i, best_clf_ind_ent]

        test_target_list.extend([pred_scores_best, pred_scores_best_ent])
        method_list.extend(['LSCP_a', 'LSCP_a_ent'])
        ######################################################################

        pred_scores_best = np.zeros([X_test.shape[0], ])
        pred_scores_ens = np.zeros([X_test.shape[0], ])

        pred_scores_best_ent = np.zeros([X_test.shape[0], ])
        pred_scores_ens_ent = np.zeros([X_test.shape[0], ])

        for i in range(X_test.shape[0]):  # iterate over all test instances
            # get the neighbor idx of the current point
            ind_k = ind_arr[i]
            # get the pseudo target: max
            target_k = target_max[ind_k, ].ravel()
            target_k_ent = target_ent_max[ind_k, ].ravel()

            # get the current scores from all clf
            curr_train_k = train_scores_norm[ind_k, :]

            # initialize containers for correlation
            corr_pear_n = np.zeros([n_clf, ])
            corr_pear_n_ent = np.zeros([n_clf, ])

            for d in range(n_clf):
                corr_pear_n[d, ] = pearsonr(target_k, curr_train_k[:, d])[0]
                corr_pear_n_ent[d, ] = pearsonr(
                    target_k_ent, curr_train_k[:, d])[0]  # * info_ent_arry[d]
                # corr_pear_n_ent[d,] = calc_nmi(target_k, curr_train_k[:, d])
            # pick the best one
            best_clf_ind = np.nanargmax(corr_pear_n)
            pred_scores_best[i, ] = test_scores_norm[i, best_clf_ind]

            pred_scores_ens[i, ] = np.mean(test_scores_norm[
                i, get_competent_detectors(corr_pear_n, n_bins, n_selected)])

            # pseudo target weighted by information entropy
            best_clf_ind_ent = np.nanargmax(corr_pear_n_ent)
            pred_scores_best_ent[i, ] = test_scores_norm[i, best_clf_ind_ent]

            pred_scores_ens_ent[i, ] = np.mean(test_scores_norm[
                i,
                get_competent_detectors(corr_pear_n_ent, n_bins, n_selected)])

        test_target_list.extend([
            pred_scores_best, pred_scores_best_ent, pred_scores_ens,
            pred_scores_ens_ent
        ])
        method_list.extend(
            ['LSCP_m', 'LSCP_m_ent', 'LSCP_aom', 'LSCP_aom_ent'])

        ######################################################################

        # store performance information and print result
        for i in range(n_baselines):
            roc_mat[t, i] = roc_auc_score(y_test, test_target_list[i])
            ap_mat[t, i] = average_precision_score(y_test, test_target_list[i])
            print(method_list[i], roc_mat[t, i])
        print('local region size:', loc_region_size)

    print("--- %s seconds ---" % (time.time() - start_time))
    execution_time = time.time() - start_time

    # save parameters
    save_script(data_name,
                base_detector,
                timestamp,
                n_ite,
                test_size,
                n_baselines,
                loc_region_perc,
                loc_region_ite,
                loc_region_threshold,
                loc_min_features,
                loc_region_size,
                loc_region_min,
                loc_region_max,
                n_clf,
                k_min,
                k_max,
                n_bins,
                n_selected,
                n_buckets,
                fb_n_neighbors,
                execution_time,
                res_path='results_avg_nmi')

    # print and save the result
    # default location is /results/***.csv
    print_save_result(data_name,
                      base_detector,
                      n_baselines,
                      roc_mat,
                      ap_mat,
                      method_list,
                      timestamp,
                      verbose,
                      res_path='results_avg_nmi')
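
The LSCP-style selection above depends on a get_competent_detectors(corr, n_bins, n_selected) helper that is not shown. As an illustration only, here is one plausible implementation, assuming it histograms the per-detector Pearson correlations into n_bins bins and returns the indices of the detectors falling in the n_selected highest-correlation bins; the helper actually used may select bins differently (for example, the most populated ones).

import numpy as np


def get_competent_detectors(corr, n_bins, n_selected):
    # Hypothetical sketch: pick detectors whose pseudo-target correlation
    # lands in the n_selected top (highest-correlation) histogram bins.
    corr = np.asarray(corr).ravel()
    _, bin_edges = np.histogram(corr[~np.isnan(corr)], bins=n_bins)
    selected = []
    # walk the bins from the highest correlation range downwards
    for b in range(n_bins - 1, n_bins - 1 - n_selected, -1):
        in_bin = np.where((corr >= bin_edges[b]) &
                          (corr <= bin_edges[b + 1]))[0]
        selected.extend(in_bin.tolist())
    return selected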