Example 1
    def test_normalization(self):
        norm_X_train, norm_X_test = standardizer(self.X_train, self.X_train)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        assert_allclose(norm_X_test.mean(), 0, atol=0.05)
        assert_allclose(norm_X_test.std(), 1, atol=0.05)

        # test when X_t is not presented
        norm_X_train = standardizer(self.X_train)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)
Example 2
    def filter(self):
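        # Load the pickled data, keep only rows for this agent, compute the
        # lead time in hours between validation and departure, keep ticketed
        # (TKTT) rows, drop negative lead times and rows with details_price
        # below 400, then standardize the two selected feature columns.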
        bd = pd.read_pickle(self.filename)
        #data=load("/home/has/Airline/dm-pfe-hm/d","rb")
        #bd

        df = pd.DataFrame(bd)
        df = df.loc[df['details_agent'] == self.agent_ref]
        df['B_dept'] = (df.details_flights_departure -
                        df.details_validation_at)
        #df.dropna()
        df['B_dept'] = df['B_dept'] / np.timedelta64(1, 'h')
        #df=df[df.details_status =='TKTT']
        df['d'] = df.details_validation_at.dt.date
        df['t'] = df.details_validation_at.dt.time
        df = df[df.details_status == 'TKTT']
        df['B_dept'] = round(df.B_dept, 0)
        df = df.drop(df[df.B_dept < 0].index)
        df = df.drop(df[df.details_price < 400].index)
        X = df.iloc[:, [2, 5]].values
        #from sklearn.preprocessing import MinMaxScaler
        #scaler = MinMaxScaler()
        #X=scaler.fit_transform(X)

        X = standardizer(X)
        # self.x=X
        return X
Example 3
    def _create_scores(self, X):
        """Internal function to generate and combine scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        agg_score: numpy array of shape (n_samples,)
            Aggregated scores.
        """
        all_scores = np.zeros([X.shape[0], self.n_base_estimators_])

        for i, clf in enumerate(self.base_estimators):
            if hasattr(clf, 'decision_function'):
                all_scores[:, i] = clf.decision_function(X)
            else:
                raise ValueError(
                    "{clf} does not have decision_function.".format(clf=clf))

        if self.standardization:
            all_scores = standardizer(all_scores)
        if self.method == 'average':
            agg_score = average(all_scores, estimator_weights=self.weights)
        elif self.method == 'maximization':
            agg_score = maximization(all_scores)
        elif self.method == 'median':
            agg_score = median(all_scores)
        else:
            raise ValueError(
                "{method} is not a valid combination method.".format(
                    method=self.method))

        return agg_score
Example 4
    def fit(self, X, contamination=0.01):
        """Fit the detector.

        Args:
            X: pd.DataFrame of training samples.
            contamination: expected proportion of outliers (default 0.01).
        """
        self.detectors = {
            "auto_encoder":
            AutoEncoder(
                epochs=256,
                validation_size=0,
                preprocessing=False,
                verbose=0,
                contamination=contamination,
            ),
        }
        # print("train_data.shape:", X.shape)
        # data preprocessing
        # standardization (zero mean, unit variance)
        X_train_norm, self.data_norm_scalar = standardizer(X, keep_scalar=True)
        # min-max normalization
        X_train_unif, self.data_unif_scalar = minmaxizer(X_train_norm,
                                                         keep_scalar=True)

        train_scores = np.zeros([X.shape[0], len(self.detectors)])
        thresholds = np.zeros([1, len(self.detectors)])
        # training
        for i, clf_name in enumerate(self.detectors):
            clf = self.detectors[clf_name]
            clf.fit(X_train_unif)
            train_scores[:, i] = clf.decision_scores_
            thresholds[:, i] = clf.threshold_
        # anomaly scores and thresholds on the training set
        train_scores_norm, self.score_scalar = standardizer(train_scores,
                                                            keep_scalar=True)
        thresholds_norm = self.score_scalar.transform(thresholds)

        self.decision_scores = pd.DataFrame(average(train_scores_norm),
                                            index=X.index)
        self.decision_scores.columns = ["score"]
        self.threshold = average(thresholds_norm)[0]
        self.label = self.get_label(self.decision_scores)
Example 5
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list, contamination=self.contamination)
        self.clf.fit(self.X_train)
Example 6
    def __train_classifiers(self):
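        # Scale the features to [0, 1], fit every loaded detector, then
        # standardize the per-detector outlier scores and combine them by
        # taking the per-sample maximum.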
        scaler = MinMaxScaler(feature_range=(0, 1))
        X = scaler.fit_transform(self.df.copy())
        classifiers = self.__load_classifiers()
        scores = np.zeros([X.shape[0], len(classifiers)])
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            try:
                clf.fit(X)
                scores[:, i] = clf.decision_scores_
            except Exception as e:
                print("Failed for ", clf_name)
                print("because of ", e)

        standard_scores = standardizer(scores)
        combined_scores = maximization(standard_scores)
        return combined_scores
Example 7
def stratified_cv(X, y, num_folds):
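    # Build stratified folds; within each fold the test split is standardized
    # with statistics fitted on that fold's training split.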

    folds = []
    skf = StratifiedKFold(n_splits=num_folds)

    splits = skf.split(X, y)

    for train_index, test_index in splits:
        X_train, X_test = X[train_index], X[test_index]

        y_train, y_test = y[train_index], y[test_index]

        X_train, X_test = standardizer(X_train, X_test)

        folds.append((X_train, y_train, X_test, y_test))

    return folds
Example 8
    def _get_decision_scores(self, X):
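        # Score X with every base estimator, standardize train and test scores
        # jointly, and for each test instance average the scores of the
        # detectors most correlated with the local pseudo ground truth.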

        # ensure local region size is within acceptable limits
        self.local_region_size = max(self.local_region_size, self.local_region_min)
        self.local_region_size = min(self.local_region_size, self.local_region_max)

        # standardize test data and get local region for each test instance
        X_test_norm = X
        ind_arr = self._get_local_region(X_test_norm)

        # calculate test scores
        test_scores = np.zeros([X_test_norm.shape[0], self.n_clf])
        for k, estimator in enumerate(self.estimator_list):
            test_scores[:, k] = estimator.decision_function(X_test_norm)

        # generate standardized scores
        train_scores_norm, test_scores_norm = standardizer(self.train_scores_, test_scores)

        # generate pseudo target for training --> for calculating weights
        self.training_pseudo_label_ = np.max(train_scores_norm, axis=1).reshape(-1, 1)

        # placeholder for predictions
        pred_scores_ens = np.zeros([X_test_norm.shape[0], ])

        # iterate through test instances (ind_arr indices correspond to x_test)
        for i, ind_k in enumerate(ind_arr):

            # get pseudo target and training scores in local region of test instance
            local_pseudo_ground_truth = self.training_pseudo_label_[ind_k,].ravel()
            local_train_scores = train_scores_norm[ind_k, :]

            # calculate pearson correlation between local pseudo ground truth and local train scores
            pearson_corr_scores = np.zeros([self.n_clf, ])
            for d in range(self.n_clf):
                pearson_corr_scores[d,] = pearsonr(local_pseudo_ground_truth, local_train_scores[:, d])[0]

            # return best score
            pred_scores_ens[i,] = np.mean(
                test_scores_norm[i, self._get_competent_detectors(pearson_corr_scores)])

        return pred_scores_ens
Example 9
    def test_normalization(self):

        # test when X_t is presented and no scalar
        norm_X_train, norm_X_test = standardizer(self.X_train, self.X_test)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        assert_allclose(norm_X_test.mean(), 0, atol=0.05)
        assert_allclose(norm_X_test.std(), 1, atol=0.05)

        # test when X_t is not presented and no scalar
        norm_X_train = standardizer(self.X_train)
        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        # test when X_t is presented and the scalar is kept
        norm_X_train, norm_X_test, scalar = standardizer(self.X_train,
                                                         self.X_test,
                                                         keep_scalar=True)

        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        assert_allclose(norm_X_test.mean(), 0, atol=0.05)
        assert_allclose(norm_X_test.std(), 1, atol=0.05)

        if not hasattr(scalar, 'fit') or not hasattr(scalar, 'transform'):
            raise AttributeError("%s is not a valid scaler instance." % scalar)

        # test when X_t is not presented and the scalar is kept
        norm_X_train, scalar = standardizer(self.X_train, keep_scalar=True)

        assert_allclose(norm_X_train.mean(), 0, atol=0.05)
        assert_allclose(norm_X_train.std(), 1, atol=0.05)

        if not hasattr(scalar, 'fit') or not hasattr(scalar, 'transform'):
            raise AttributeError("%s is not a valid scaler instance." % scalar)

        # test shape difference
        with assert_raises(ValueError):
            standardizer(self.X_train, self.X_test_diff)
Example 10
    except (TypeError, IOError):
        print('{data_file} does not exist. Using generated data instead.'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # generate sample data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    n_clf = 20  # number of base detectors

    # Initialize 20 base detectors for combination
    k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
              150, 160, 170, 180, 190, 200]

    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i in range(n_clf):
        k = k_list[i]
Example 11
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    roc_mat = np.zeros([n_ite, n_classifiers])
    prn_mat = np.zeros([n_ite, n_classifiers])
    time_mat = np.zeros([n_ite, n_classifiers])

    for i in range(n_ite):
        print("\n... Processing", mat_file, '...', 'Iteration', i + 1)
        random_state = np.random.RandomState(i)

        # 60% data for training and 40% for testing
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.4, random_state=random_state)

        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
            contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor': CBLOF(
                contamination=outliers_fraction, check_estimator=False,
                random_state=random_state),
            'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                              check_estimator=False,
                                              random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)': HBOS(
                contamination=outliers_fraction),
            'Isolation Forest': IForest(contamination=outliers_fraction,
                                        random_state=random_state),
            'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
            'Local Outlier Factor (LOF)': LOF(
Example 12
            rp_flags[starts[i]:starts[i + 1]],
            None,
            approx_flags[starts[i]:starts[i + 1]],
            verbose=True)
        for i in range(n_jobs))

    print('Orig decision_function time:', time.time() - start)
    print()

    # unfold and generate the label matrix
    predicted_scores_orig = np.zeros([X.shape[0], n_estimators])
    for i in range(n_jobs):
        predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_scores[i]).T
    ##########################################################################
    predicted_scores = standardizer(predicted_scores)
    predicted_scores_orig = standardizer(predicted_scores_orig)

    evaluate_print('orig', y_test, np.mean(predicted_scores_orig, axis=1))
    evaluate_print('new', y_test, np.mean(predicted_scores, axis=1))
    
#%%

    ##########################################################################
    start = time.time()
    for i in range(n_estimators):
        print(i)
        trained_estimators[i].predict(X)

    print('Orig decision_function time:', time.time() - start)
    print()
Example 13
    def _get_decision_scores(self, X):
        """ Helper function for getting outlier scores on test data X (note:
        model must already be fit)

        Parameters
        ----------
        X : numpy array, shape (n_samples, n_features)
            Test data

        Returns
        -------
        pred_scores_ens : numpy array, shape (n_samples,)
            Outlier scores for test samples
        """

        # raise warning if local region size is outside acceptable limits
        if (self.local_region_size < self.local_region_min) or (
                self.local_region_size > self.local_region_max):
            warnings.warn("Local region size of {} is outside "
                          "recommended range [{}, {}]".format(
                self.local_region_size, self.local_region_min,
                self.local_region_max))

        # standardize test data and get local region for each test instance
        X_test_norm = X
        test_local_regions = self._get_local_region(X_test_norm)

        # calculate test scores
        test_scores = np.zeros([X_test_norm.shape[0], self.n_clf])
        for k, detector in enumerate(self.detector_list):
            test_scores[:, k] = detector.decision_function(X_test_norm)

        # generate standardized scores
        train_scores_norm, test_scores_norm = standardizer(self.train_scores_,
                                                           test_scores)

        # generate pseudo target for training --> for calculating weights
        self.training_pseudo_label_ = np.max(train_scores_norm,
                                             axis=1).reshape(-1, 1)

        # placeholder for ensemble predictions
        pred_scores_ens = np.zeros([X_test_norm.shape[0], ])

        # iterate through test instances (test_local_regions
        # indices correspond to x_test)
        for i, test_local_region in enumerate(test_local_regions):

            # get pseudo target and training scores in local region of
            # test instance
            local_pseudo_ground_truth = self.training_pseudo_label_[
                test_local_region,].ravel()
            local_train_scores = train_scores_norm[test_local_region, :]

            # calculate pearson correlation between local pseudo ground truth
            # and local train scores
            pearson_corr_scores = np.zeros([self.n_clf, ])
            for d in range(self.n_clf):
                pearson_corr_scores[d,] = pearsonr(
                    local_pseudo_ground_truth, local_train_scores[:, d])[0]

            # return best score
            pred_scores_ens[i,] = np.mean(
                test_scores_norm[
                    i, self._get_competent_detectors(pearson_corr_scores)])

        return pred_scores_ens
Example 14
    ]

    mat_file = mat_file_list[0]
    mat_file_name = mat_file.replace('.mat', '')
    print("\n... Processing", mat_file_name, '...')
    mat = sp.io.loadmat(os.path.join('', 'datasets', mat_file))

    X = mat['X']
    y = mat['y']

    # split dataset into train and test
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    # standardize data to be digestible for most algorithms
    X_train, X_test = standardizer(X_train, X_test)

    contamination = y.sum() / len(y)

    # get estimators for training and prediction
    base_estimators = get_estimators(contamination=contamination)

    ##########################################################################
    model = SUOD(base_estimators=base_estimators,
                 rp_flag_global=True,
                 approx_clf=approx_clf,
                 n_jobs=n_jobs,
                 bps_flag=True,
                 contamination=contamination,
                 approx_flag_global=True)
Example 15
def run_all_models(all_array, labels, pca, data_set_name):
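    # Standardize the feature table, split it into train and test sets, then
    # fit each detector in turn and append its scores and runtime to
    # output_table.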
    picture_name = all_array.get("# img", 1)
    all_array = all_array.drop("# img", axis=1)

    # standardizing data for processing
    all_array = standardizer(all_array)

    y = labels.get("in").to_numpy()
    x_train, x_test, y_train, y_test, picture_train, picture_test = train_test_split(all_array, y, picture_name,
                                                                                     test_size=0.4)

    if pca:
        transformer = IncrementalPCA()
        all_array = transformer.fit_transform(all_array)

    print("OCSVM")
    now = time()
    clf = OCSVM()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("OCSVM", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("Auto-encoder")
    now = time()
    clf = AutoEncoder(epochs=30)
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("Auto-encoder", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("HBOS")
    now = time()
    clf = HBOS()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("HBOS", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("SO_GAAL")
    now = time()
    clf = SO_GAAL()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("SO_GAAL", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("MO_GAAL")
    now = time()
    clf = MO_GAAL()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("MO_GAAL", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("MCD")
    now = time()
    clf = MCD()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("MCD", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("SOS")
    now = time()
    clf = SOS()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("SOS", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("IForest")
    now = time()
    clf = IForest()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("IForest", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("KNN")
    now = time()
    clf = KNN()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("KNN", all_array.shape, temp, data_set_name, time() - now, scores_train))

    print("PCA")
    now = time()
    clf = PCA()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    temp = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    scores_train = print_score(picture_train, train_scores, y_train)
    output_table.append(("PCA", all_array.shape, temp, data_set_name, time() - now, scores_train))
Example 16
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    print('Dataset Shape:', X.shape)
    print('Outliers Percentage:', outliers_percentage)

    # construct containers for saving results of each dataset
    roc_list = []
    prn_list = []
    time_list = []

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=random_state)
    
    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # define classifiers
    classifiers = define_classifiers(random_state, outliers_fraction)

    # create df for results
    train_results = pd.DataFrame(columns=classifiers.keys())
    test_results = pd.DataFrame(columns=classifiers.keys())

    print ('\n', 'Outliers Detection', '\n')
    for clf_name, clf in classifiers.items():

        # keep names of the conventional models (only on the first iteration)
        if num_mat == 0:
            method_names.append(clf_name)
Example 17
def normalize_data(data):
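    # Thin wrapper around pyod's standardizer (zero mean, unit variance).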
    return standardizer(data)
Example 18
    print('processing file '+ file[-8:-4])
    print('----------')
    df = pd.read_csv(file)
    x = df.drop(['ground.truth','point.id','motherset','origin','original.label'],axis = 1).values
    y = df['ground.truth'].values
    y = [0 if i == 'nominal' else 1 for i in y]

    outliers_fraction = min(np.count_nonzero(y) / len(y),0.5)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)
    
    roc_list = [file[-8:-4], x.shape[0], x.shape[1], outliers_percentage]
    prn_list = [file[-8:-4], x.shape[0], x.shape[1], outliers_percentage]
    time_list = [file[-8:-4], x.shape[0], x.shape[1], outliers_percentage]
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,random_state=random_state)
    x_train_norm, x_test_norm = standardizer(x_train, x_test)
    
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
Example 19
from vae import VAE
from pyod.utils.data import generate_data, evaluate_print
from pyod.utils.utility import standardizer

if __name__ == "__main__":
    # contamination = 0.1  # percentage of outliers
    # n_train = 20000  # number of training points
    # n_test = 2000  # number of testing points

    X_image = np.load('train_image_embedding.npy')
    X_text = np.load('word2vec.npy')

    X = np.concatenate([X_image, X_text], axis=1)
    n_features = X.shape[1]  # number of features

    X_transformed = standardizer(X)
    # # train VAE detector (Beta-VAE)
    clf_name = 'VAE'
    clf = VAE(epochs=50, latent_dim=128, gamma=1, capacity=0)
    clf.fit(X_transformed)

    # # get the prediction labels and outlier scores of the training data
    # y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    # y_train_scores = clf.decision_scores_  # raw outlier scores

    # # get the prediction on the test data
    # y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    # y_test_scores = clf.decision_function(X_test)  # outlier scores

    # # evaluate and print the results
    # print("\nOn Training Data:")
Example 20
    IForest(random_state=42),
    LOF(),
    OCSVM(),
    PCA(),
    KNN(),
    HBOS(),
    COPOD(),
    AutoEncoder(verbose=0),
    VAE(latent_dim=32, verbosity=0)
]

for embedding, modality in zip(unimodal_embeddings, unimodality):
    print()
    print(modality)
    print()
    embedding_scaled = standardizer(embedding)

    for clf in clfs:
        # print(clf)
        clf.fit(embedding_scaled)
        evaluate_print(clf.__class__.__name__, anomaly_label,
                       clf.decision_scores_)

#%%
image_text_embedding = [
    np.load(os.path.join("unimodality", "image", "train_image_embedding.npy")),
    np.load(os.path.join("unimodality", "language", "word2vec.npy")),
]

print("score averaging")
Example 21
    def _get_decision_scores(self, X):
        """ Helper function for getting outlier scores on test data X (note:
        model must already be fit)

        Parameters
        ----------
        X : numpy array, shape (n_samples, n_features)
            Test data

        Returns
        -------
        pred_scores_ens : numpy array, shape (n_samples,)
            Outlier scores for test samples
        """

        # raise warning if local region size is outside acceptable limits
        if (self.local_region_size < self.local_region_min) or (
                self.local_region_size > self.local_region_max):
            warnings.warn("Local region size of {} is outside "
                          "recommended range [{}, {}]".format(
                self.local_region_size, self.local_region_min,
                self.local_region_max))

        # standardize test data and get local region for each test instance
        X_test_norm = X
        test_local_regions = self._get_local_region(X_test_norm)

        # calculate test scores
        test_scores = np.zeros([X_test_norm.shape[0], self.n_clf])
        for k, detector in enumerate(self.detector_list):
            test_scores[:, k] = detector.decision_function(X_test_norm)

        # generate standardized scores
        train_scores_norm, test_scores_norm = standardizer(self.train_scores_,
                                                           test_scores)

        # generate pseudo target for training --> for calculating weights
        self.training_pseudo_label_ = np.max(train_scores_norm,
                                             axis=1).reshape(-1, 1)

        # placeholder for ensemble predictions
        pred_scores_ens = np.zeros([X_test_norm.shape[0], ])

        # iterate through test instances (test_local_regions
        # indices correspond to x_test)
        for i, test_local_region in enumerate(test_local_regions):

            # get pseudo target and training scores in local region of
            # test instance
            local_pseudo_ground_truth = self.training_pseudo_label_[
                test_local_region,].ravel()
            local_train_scores = train_scores_norm[test_local_region, :]

            # calculate pearson correlation between local pseudo ground truth
            # and local train scores
            pearson_corr_scores = np.zeros([self.n_clf, ])
            for d in range(self.n_clf):
                pearson_corr_scores[d,] = pearsonr(
                    local_pseudo_ground_truth, local_train_scores[:, d])[0]

            # return best score
            pred_scores_ens[i,] = np.mean(
                test_scores_norm[
                    i, self._get_competent_detectors(pearson_corr_scores)])

        return pred_scores_ens
Example 22
            if i[1][5] == 'anomaly':
                y.append(1)
                contam += 1
            else:
                y.append(0)
            x_train.append(list(i[1][6:17]))
    x_train = np.array(x_train)
    y = np.array(y)
    contam /= len(y)

    algorithms = ['KNN', 'LOF', 'PCA', 'LODA']
    all_scores = {}

    clf_name = 'KNN'
    clf = KNN(n_neighbors=5, contamination=contam)
    x_train = standardizer(x_train)
    clf.fit(x_train)
    knn_y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    knn_y_scores = clf.decision_scores_  # raw outlier scores
    evaluation(y, knn_y_scores, clf_name)
    all_scores['KNN'] = knn_y_scores

    clf_name = 'LOF'
    clf = LOF(contamination=contam)
    x_train = standardizer(x_train)
    clf.fit(x_train)
    y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores
    evaluation(y, y_scores, clf_name)
    all_scores['LOF'] = y_scores
Example 23
def analyze_selected_algorithm(file_id, dataset_title, selected_algorithm):
    clf_name = selected_algorithm.split()[-1].strip("()")
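    # Fit the selected pyod detector on a standardized 75/25 split, compute
    # ROC-AUC on the held-out part, and plot ground truth vs. predictions
    # (using a 2-D t-SNE embedding when the data has more than two features).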
    mat = db_queries.get_dataframe(file_id)
    filename = "analyze_{}_{}_{}.png".format(
        file_id, clf_name, len(os.listdir("./images/"))
    )
    path = "./images/{}".format(filename)

    mat = mat.drop(["Unnamed: 0", "Index", "id", "Id"], axis=1, errors="ignore")

    y = mat["outlier"].values
    X = mat.drop("outlier", axis=1).values
    X_embedded = TSNE(n_components=2).fit_transform(X)

    outliers_fraction = np.count_nonzero(y) / len(y)

    b = np.arange(X.shape[0]).reshape((X.shape[0], 1))
    X = np.hstack((X, b))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    train_ids = X_train[:, -1].astype(int)
    X_train = X_train[:, :-1]
    test_ids = X_test[:, -1].astype(int)
    X_test = X_test[:, :-1]

    # standardizing data for processing, mean=0, var=1
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    if clf_name in ["PCA", "IFOREST"]:
        # clf = algo_mapping[clf_name](
        #     contamination=outliers_fraction, random_state=random_state
        # )
        clf = algo_mapping[clf_name](contamination=outliers_fraction)
    else:
        clf = algo_mapping[clf_name](contamination=outliers_fraction)

    clf.fit(X_train_norm)
    test_scores = clf.decision_function(X_test_norm)
    roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
    y_test_predicted = clf.predict(X_test_norm)

    print(X_train_norm.shape)

    if X_train_norm.shape[1] > 2:
        # Building the Plot.
        fig = plt.figure(figsize=(10, 4))

        fig.add_subplot(1, 2, 1)
        X_out, X_in = X_embedded[test_ids[y_test == 1]], X_embedded[test_ids[y_test == 0]]
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^", alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h", alpha=0.5)
        plt.title("Ground truth")

        fig.add_subplot(1, 2, 2)
        X_out, X_in = (
            X_embedded[test_ids[y_test_predicted == 1]],
            X_embedded[test_ids[y_test_predicted == 0]],
        )
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^", alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h", alpha=0.5)
        plt.title("Predicted")

        sptl = plt.suptitle(
            "Dataset: {}, ROC: {}\nAlgorithm: {}".format(dataset_title[:-4], roc, clf_name),
            y=1.08,
            fontsize=14,
        )
        lgd = plt.legend(
            labels=["Normal data", "Anomalous data"],
            title="Legend",
            shadow=True,
            ncol=1,
            fontsize=12,
            loc="center left",
            bbox_to_anchor=(1, 0.5),
        )
        plt.savefig(path, dpi=100, bbox_extra_artists=(lgd, sptl), bbox_inches="tight")
        plt.close()
    else:
        # Building the Plot.
        fig = plt.figure(figsize=(10, 4))

        fig.add_subplot(1, 2, 1)
        X_out, X_in = X[test_ids[y_test == 1]][:, :-1], X[test_ids[y_test == 0]][:, :-1]
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^", alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h", alpha=0.5)
        plt.title("Ground truth")

        fig.add_subplot(1, 2, 2)
        X_out, X_in = (
            X[test_ids[y_test_predicted == 1]][:, :-1],
            X[test_ids[y_test_predicted == 0]][:, :-1],
        )
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^", alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h", alpha=0.5)
        plt.title("Predicted")

        sptl = plt.suptitle(
            "Dataset: {}, ROC: {}\nAlgorithm: {}".format(dataset_title[:-4], roc, clf_name),
            y=1.08,
            fontsize=14,
        )
        lgd = plt.legend(
            labels=["Normal data", "Anomalous data"],
            title="Legend",
            shadow=True,
            ncol=1,
            fontsize=12,
            loc="center left",
            bbox_to_anchor=(1, 0.5),
        )
        plt.savefig(path, dpi=100, bbox_extra_artists=(lgd, sptl), bbox_inches="tight")
        plt.close()

    return filename
Example 24
train_scores = pd.DataFrame({'clf1': clf1.decision_scores_,
                             'clf2': clf2.decision_scores_,
                             'clf3': clf3.decision_scores_
                            })

test_scores  = pd.DataFrame({'clf1': clf1.decision_function(X_test),
                             'clf2': clf2.decision_function(X_test),
                             'clf3': clf3.decision_function(X_test) 
                            })



# Although we did standardization before, it was for the variables.
# Now we do the standardization for the decision scores
from pyod.utils.utility import standardizer
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# Combination by average
y_by_average = average(test_scores_norm)
             
import matplotlib.pyplot as plt
plt.hist(y_by_average, bins='auto')  # arguments are passed to np.histogram
plt.title("Combination by average")
plt.show()


df_test = pd.DataFrame(X_test)
df_test['y_by_average_score'] = y_by_average
df_test['y_by_average_cluster'] = np.where(df_test['y_by_average_score']<0, 0, 1)
df_test['y_by_average_cluster'].value_counts()
Example 25
    with open(fileName) as data:
        lines = data.readlines()
        for line in lines:
            lineData = line.strip().split(' ')
            lineData = list(map(lambda x: float(x), lineData))
            dataMat.append(lineData)
    return (np.array(dataMat))


data = data_loadDataSet()

X_train, y_train, X_test, y_test = generate_data(n_train=50,
                                                 n_test=50,
                                                 contamination=0.1,
                                                 random_state=42)
X_train, X_test = standardizer(X_train, X_test)
detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15)]
clf = LSCP(detector_list)
clf.fit(X_train)
clf.fit(data)
y_train_scores = clf.decision_scores_

sort_factor = argsort(y_train_scores, kind='quicksort')
print(sort_factor)
sort_factors = sort_factor[::-1]
print(sort_factors)
np.savetxt(r'C:\Users\zz\Desktop\res\lscp\D1_2.txt',
           sort_factors,
           fmt='%f',
           delimiter=' ')
Example 26
    roc_mat = np.zeros([n_ite, n_classifiers])
    prn_mat = np.zeros([n_ite, n_classifiers])
    ap_mat = np.zeros([n_ite, n_classifiers])
    time_mat = np.zeros([n_ite, n_classifiers])

    for i in range(n_ite):
        print("\n... Processing", mat_file, '...', 'Iteration', i + 1)
        random_state = np.random.RandomState(i)

        # 60% data for training and 40% for testing
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.4, random_state=random_state)

        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        classifiers = {'COD_L': COD(contamination=outliers_fraction, tail='left'),
                       'COD_R': COD(contamination=outliers_fraction, tail='right'),
                       'COD_B': COD(contamination=outliers_fraction, tail='both'),
                       'COD_S': COD(contamination=outliers_fraction, tail='skew'),
                       'COD_M': COD(contamination=outliers_fraction, tail='max'),
                       'COD': COD(contamination=outliers_fraction)
                       }
        classifiers_indices = {
            'COD_L': 0,
            'COD_R': 1,
            'COD_B': 2,
            'COD_S': 3,
            'COD_M': 4,
            'COD': 5
Example 27
    def analysis():
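        # For every CSV in fileList: split 60/40, standardize, fit the six
        # detectors, and record each ROC AUC into roc.csv.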
        roc_df = pd.DataFrame(columns=df_columns)
        prn_df = pd.DataFrame(columns=df_columns)

        for doc in fileList:
            print(doc)
            df = pd.read_csv(doc, encoding='utf-8')
            # x =df.loc[:,('V1','V2','V3','V4','V5','V6','V7')]
            x = df.loc[:, ('R', 'G', 'B')]
            # x=df.iloc[:,6:57]
            y = df.loc[:, 'original.label']
            roc_list = [count, doc]
            count = count + 1
            roc_mat = np.zeros(6)
            # set the outlier fraction
            random_state = np.random.RandomState(42)
            outliers_fraction = 0.02
            # define the six outlier detection models used below
            classifiers = {
                "Feature Bagging":
                FeatureBagging(LOF(n_neighbors=35),
                               contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state),
                "Isolation Forest":
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
                "KNN":
                KNN(contamination=outliers_fraction),
                'Local Outlier Factor':
                LOF(contamination=outliers_fraction),
                'One-class SVM':
                OCSVM(contamination=outliers_fraction),
                'Principal Component Analysis':
                PCA(contamination=outliers_fraction,
                    random_state=random_state),
            }
            classifiers_indices = {
                'Feature Bagging': 0,
                'Isolation Forest': 1,
                'KNN': 2,
                'Local Outlier Factor': 3,
                'One-class SVM': 4,
                'Principal Component Analysis': 5,
            }
            # 60% data for training and 40% for testing
            X_train, X_test, y_train, y_test = \
                train_test_split(x, y, test_size=0.4, random_state=random_state)

            # standardizing data for processing
            X_train_norm, X_test_norm = standardizer(X_train, X_test)
            for i, (clf_name, clf) in enumerate(classifiers.items()):
                clf.fit(X_train_norm, y_train)
                # predict outlier scores on the test data
                scores_pred = clf.decision_function(X_test_norm)
                try:
                    roc = round(roc_auc_score(y_test, scores_pred), ndigits=4)
                    roc_mat[classifiers_indices[clf_name]] = roc
                except ValueError:
                    continue
            roc_list = roc_list + roc_mat.tolist()
            temp_df = pd.DataFrame(roc_list).transpose()
            temp_df.columns = [
                'Data', 'dir', 'FB', 'IForest', 'Average KNN', 'LOF', 'OCSVM',
                'PCA'
            ]
            roc_df = pd.concat([roc_df, temp_df], axis=0)

            roc_df.to_csv("roc.csv", index=False, float_format="%.3f")
Example 28
    roc_mean = []
    roc_max = []
    roc_aom = []
    roc_moa = []

    prn_mean = []
    prn_max = []
    prn_aom = []
    prn_moa = []

    for t in range(ite):
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.4)

        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # initialize 20 base detectors for combination
        k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
                  150, 160, 170, 180, 190, 200]

        train_scores = np.zeros([X_train.shape[0], n_clf])
        test_scores = np.zeros([X_test.shape[0], n_clf])

        for i in range(n_clf):
            k = k_list[i]

            clf = Knn(n_neighbors=k, method='largest')
            clf.fit(X_train_norm)

            train_scores[:, i] = clf.decision_scores.ravel()
Example 29
    return


if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      contamination=contamination,
                      random_state=42)
    X_train, X_test = standardizer(X_train, X_test)

    # train lscp
    clf_name = 'LSCP'
    detector_list = [LOF(), LOF()]
    clf = LSCP(detector_list, random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores