Example 1
def train():
    # start the training process
    try:
        # 1. Load the data and configuration
        data = pd.read_csv(training_path, index_col=0)

        # Read in any configuration stored
        with open(param_path, 'r') as tc:
            hyper_parameters = json.load(tc)

        # 2. Instantiate and fit the Isolation Forest model
        model = IForest(contamination=hyper_parameters['contamination'],
                        behaviour='new')
        model.fit(data)

        # 3. Save the trained model
        model_name = 'great_model'
        with open(os.path.join(model_path, '{}.pkl'.format(model_name)),
                  'wb') as out:
            pickle.dump(model, out, protocol=0)

    # If training fails, write a failure file and exit with a non-zero code
    except Exception as e:
        # write the log
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during train: ' + str(e) + '\n' + trc)
        sys.exit(255)
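This snippet assumes module-level imports and path constants defined elsewhere in the source script. A minimal sketch of that setup, assuming a SageMaker-style container layout (the concrete paths and file names here are assumptions, not part of the original):

import json
import os
import pickle
import sys
import traceback

import pandas as pd
from pyod.models.iforest import IForest

# Assumed SageMaker-style container paths (hypothetical values).
prefix = '/opt/ml/'
training_path = os.path.join(prefix, 'input/data/training/train.csv')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
model_path = os.path.join(prefix, 'model')
output_path = os.path.join(prefix, 'output')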
Example 2
    def fit(self, X_train, y_train=None):
        """Fit the model. y is ignored in unsupervised methods.

       Parameters
       ----------
       X_train : numpy array of shape (n_samples, n_features)
           The input samples.

       y_train : Ignored
           Not used, present for API consistency by convention.

       Returns
       -------
       self : object
           The fitted estimator.
       """
        self.model_ = IForest(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            max_features=1.,
            bootstrap=False,
            n_jobs=-1,
            behaviour='deprecated',  # no longer used as of scikit-learn 0.24
            random_state=self.random_state,
            verbose=self.verbose)

        self.model_.fit(X=X_train)

        return self
Example 3
def dorc(preprocessedData, random_state, outliers_fraction=0.1):

    t0 = time.time()
    clf = IForest(contamination=outliers_fraction,
                  random_state=random_state,
                  n_jobs=-1)
    clf.fit(preprocessedData)
    scores = clf.decision_function(preprocessedData)

    # Apply IQR-based criteria to identify rare cells for further downstream analysis.
    q3 = np.percentile(scores, 75)
    iqr = stats.iqr(scores)
    th = q3 + (1.5 * iqr)

    # Select indexes that satisfy IQR-based thresholding criteria.
    indIqr = np.where(scores >= th)[0]
    print('shape of selected cells : {}'.format(indIqr.shape))

    # Create a file with binary predictions
    predictions = np.zeros(preprocessedData.shape[0])
    predictions[indIqr] = 1  # Replace predictions for rare cells with '1'.

    t1 = time.time()
    duration = round(t1 - t0, ndigits=4)
    print("Total running DoRC time is :" + str(duration) + " s")

    return predictions, scores, duration
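A brief usage sketch for dorc: the function needs time, numpy, scipy.stats, and IForest in scope; the synthetic matrix below is an assumption standing in for a preprocessed single-cell expression matrix.

import time

import numpy as np
from scipy import stats
from pyod.models.iforest import IForest

rng = np.random.RandomState(0)
X = rng.randn(500, 20)  # hypothetical preprocessed single-cell matrix

predictions, scores, duration = dorc(X, random_state=0)
print('rare cells flagged:', int(predictions.sum()))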
Example 4
    def densityBased(self):
        '''
        @brief Function that implements the density-based component
        @param self
        @return It returns the vector with the scores of the instances
        '''
        # Initialize the scores
        scores = np.array([0] * len(self.dataset)).astype(float)
        for i in range(self.num_iter):
            iforest = IForest(contamination=self.contamination,
                              behaviour="new")
            # Subsample size drawn uniformly from [50, 1000]
            subsample_size = np.random.randint(50, 1001)
            sample = []
            if subsample_size >= len(self.dataset):
                sample = list(range(len(self.dataset)))
            else:
                # Take the sample and train the model
                sample = np.random.choice(len(self.dataset),
                                          size=subsample_size,
                                          replace=False)
            iforest.fit(self.dataset[sample])
            # Accumulate scores; an instance left out of a subsample
            # contributes nothing for that iteration
            scores[sample] += iforest.decision_scores_
        # Divide by the number of iterations and standardize
        scores = scores / self.num_iter
        scores = scale(scores)
        return scores
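One caveat: dividing the accumulated scores by num_iter treats an instance that was left out of a subsample as if it had scored 0 in that iteration. A minimal standalone variant that averages only over the iterations in which each instance was actually sampled (this reading of the intended semantics is an assumption):

import numpy as np
from pyod.models.iforest import IForest

def subsampled_iforest_scores(dataset, num_iter=10, contamination=0.1):
    scores = np.zeros(len(dataset))
    counts = np.zeros(len(dataset))
    for _ in range(num_iter):
        # Subsample size drawn uniformly from [50, 1000], capped at the dataset size
        size = min(np.random.randint(50, 1001), len(dataset))
        sample = np.random.choice(len(dataset), size=size, replace=False)
        clf = IForest(contamination=contamination)
        clf.fit(dataset[sample])
        scores[sample] += clf.decision_scores_
        counts[sample] += 1
    # Average each instance over the iterations in which it was scored
    return scores / np.maximum(counts, 1)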
Example 5
def anomaly_detection(data, label):
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)

    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)

    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y,
        ifo.decision_scores_,
        bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y,
        ae.decision_scores_,
        bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # LOF is slow to train on the full dataset; under-sampling is needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y,
        lof.decision_scores_,
        bw=0.1,
        title='Fraud, Local outlier factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
Example 6
class IForestSupervisedKNN(BaseDetector):
    def __init__(self, get_top=0.8, if_params={}, knn_params={}):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False

        self.iforest = IForest(**if_params)

        self.knn = KNN(**knn_params)

    def fit(self, X, y=None):

        X = check_array(X)
        self._set_n_classes(y)

        self.iforest.fit(X)

        # Probability of each training point being an outlier under the forest
        scores = self.iforest.predict_proba(X)[:, 1]

        # Keep the lowest-scoring get_top fraction as presumed normal instances
        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]

        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

        self.is_fitted = True

        return self

    def decision_function(self, X):

        check_is_fitted(self, ['is_fitted'])

        return self.knn.decision_function(X)
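A brief usage sketch of this hybrid detector, using PyOD's synthetic data helper; the parameter values are illustrative assumptions.

from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

clf = IForestSupervisedKNN(get_top=0.8,
                           if_params={'random_state': 42},
                           knn_params={'n_neighbors': 5})
clf.fit(X_train)
test_scores = clf.decision_function(X_test)  # KNN scores on held-out data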
Example 7
def iforest(X_train, X_test, Y_train, Y_test):
    from pyod.models.iforest import IForest
    model = IForest(random_state=0)
    model.fit(X_train)
    pred = model.predict(X_test)  # binary labels (0: inlier, 1: outlier)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
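predict returns binary labels (0 for inliers, 1 for outliers), so the accuracy comparison only makes sense when Y_test uses the same 0/1 encoding, as PyOD's synthetic data does. A brief usage sketch (the data is an assumption):

import numpy as np
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=0)

acc = iforest(X_train, X_test, y_train, y_test)  # prints and returns accuracy * 100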
Example 9
def do_iforest(x, n_estimators=100, max_samples=512):
    clf = IForest(behaviour="new",
                  n_estimators=n_estimators,
                  max_samples=max_samples,
                  random_state=None)
    y_pred = clf.fit_predict(x)  # fit and label the training data (0/1)
    scores = clf.decision_function(x)
    index = np.where(y_pred == 1)[0]  # indices flagged as outliers
    return clf, scores, index
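Here fit_predict fits the detector and returns binary labels for the training points themselves, and decision_function then re-scores the same points. A brief usage sketch (the data is an assumption):

import numpy as np
from pyod.models.iforest import IForest

rng = np.random.RandomState(1)
x = np.r_[rng.randn(200, 5), rng.randn(10, 5) + 6]  # inliers plus a shifted cluster

clf, scores, index = do_iforest(x, n_estimators=100, max_samples=64)
print('flagged as outliers:', index)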
Example 10
def remove_outliars(dft, target_col):

    ol_model = IForest()  # the choice of detector could itself be a hyperparameter
    ol_model.fit(dft.drop(columns=target_col))
    dft['is_outlier'] = ol_model.labels_
    dft = dft[dft['is_outlier'] != 1]
    dft = dft.drop(columns='is_outlier')
    print("Completed Outlier Detection - ", datetime.datetime.now())

    return dft
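A brief usage sketch: dropping flagged rows before fitting a downstream model. The DataFrame is an assumption; remove_outliars expects IForest and datetime to be imported in its module.

import datetime

import numpy as np
import pandas as pd
from pyod.models.iforest import IForest

df = pd.DataFrame(np.random.randn(300, 4), columns=['a', 'b', 'c', 'target'])
clean_df = remove_outliars(df, target_col='target')
print(len(df), '->', len(clean_df), 'rows after outlier removal')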
Example 11
def outlier_iforest(data, **kwargs):
    import pandas as pd
    from pyod.models.iforest import IForest
    contamination = float(kwargs.pop('contamination'))
    clf = IForest(contamination=contamination)
    clf.fit(data)
    pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
    df = pd.DataFrame(pred, columns=['is_outlier'])
    ret = pd.concat([data, df], axis=1)
    return ret
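Because contamination is popped from kwargs and parsed with float, it may be passed as a number or a numeric string. A brief usage sketch (the data is an assumption):

import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.randn(200, 3), columns=['x', 'y', 'z'])
result = outlier_iforest(data, contamination='0.05')
print(result['is_outlier'].value_counts())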
Example 12
    def S2(self):

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(), ['S2']] = 0  # would mark rows flagged anomalous in S1 as 0 in S2

        result['统计异常'] = water_data['S2'].values  # '统计异常' = statistical anomaly flag

        # Find the most anomalous feature dimension
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values  # '统计异常维度' = anomalous dimension

        # Save the scaler and the Isolation Forest model
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
Example 13
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)
Example 14
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        self.clf = IForest(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example 15
def getOutlierIForest(dataset):
    '''
    @brief Function that runs the IForest algorithm on the dataset and obtains
    labels indicating whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to run the algorithm
    @return It returns a list of labels: 0 means inlier, 1 means outlier
    '''
    # Initialize the model without verbose output
    ifor = IForest(verbose=0)
    # Fits the data and obtains labels
    ifor.fit(dataset)
    # Return labels
    return ifor.labels_
Example 16
    def detect(self, X, y=None):
        """
        :param X: Dataframe
        :param y: np.array
        :return: outlier scores
        """
        rng = np.random.RandomState(42)
        # Construct the detector
        n_estimators = 200  # number of trees in the forest
        outliers_fraction = 0.5  # assumed proportion of outliers
        clf = IForest(max_samples='auto', random_state=rng,
                      contamination=outliers_fraction,
                      n_estimators=n_estimators)
        clf.fit(X)
        scores = clf.decision_function(X)
        return scores
Example 17
class IForestPyOD(BaseAlgorithm):
    name = "iForest_pyod"

    def __init__(self, t=100, psi=256):
        # t: number of trees; psi: subsampling size per tree
        self.iforest = IForest(max_samples=psi, n_estimators=t,
                               behaviour="new", contamination=0.1)

    def fit(self, X):

        self.iforest.fit(X)

    def predict(self, X):

        return self.iforest.decision_function(X)
Example 18
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Returns a dataframe describing those outliers present in stocks based on the provided rules.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = { str(r):r for r in rules }
    rows = []
    stocks_by_sector_df = stocks_by_sector() # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df['asx_code']
    for stock in stocks:
        #print("Processing stock: ", stock)
        try:
            sector = stocks_by_sector_df.at[stock, 'sector_name']
            sector_companies = list(stocks_by_sector_df.loc[stocks_by_sector_df['sector_name'] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(None, "Unable to locate watchlist entry: {} - continuing without it".format(stock))
            continue
        state = {
            'day_low_high_df': day_low_high_df,  # never changes each day, so we init it here
            'all_stocks_change_in_percent_df': all_stocks_cip,
            'stock': stock,
            'daily_range_threshold': 0.20, # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            market_avg = all_stocks_cip[date].mean()
            sector_avg = all_stocks_cip[date].filter(items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({ 'market_avg': market_avg, 'sector_avg': sector_avg,
                           'stock_move': stock_move, 'date': date })
            for rule_name, rule in str_rules.items():
                points_by_rule[rule_name] += rule(state)
        d = { 'stock': stock }
        d.update(points_by_rule)
        rows.append(d)
    df = pd.DataFrame.from_records(rows)
    df = df.set_index('stock')
    print(df)
    from pyod.models.iforest import IForest
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)  # binary labels (0: inlier, 1: outlier)
    results = [row[0] for row, value in zip(df.iterrows(), scores) if value > 0]
    #print(results)
    print("Found {} outlier stocks".format(len(results)))
    return results
Example 19
def add_other_class(num, size, pad):
    res = pd.read_csv("data/train.txt", header=None).values
    tif_data = []
    for r in tqdm(range(res.shape[0])):
        img = get_cell(res[r][1], res[r][2], size)
        if img is None:
            print("img NOT Exist.", res[r])
            continue
        img = img.reshape(-1).tolist()
        tif_data.append([labels_key[res[r][0]]] + img)
    tif_data = np.array(tif_data)
    print(tif_data.shape)

    np.random.shuffle(tif_data)
    clf = IForest()
    clf.fit(tif_data[:, 1:])

    i = 0
    pos = []
    false_num = 0
    while True:
        ix = np.random.randint(pad, dataset.RasterXSize - pad)
        iy = np.random.randint(pad, dataset.RasterYSize - pad)
        t = get_cell(ix, iy, size)
        if t is None:
            continue
        t = t.reshape(1, -1)
        y_test_pred = clf.predict(t)[0]  # outlier labels (0 or 1)
        if y_test_pred == 1:
            i += 1
            pos.append(["其他"] + [ix, iy])  # "其他" = "other" class label
            print("{}/{} added.".format(i, num))
        else:
            false_num += 1
            print("{}/{}: cell ({}, {}) not added. false_num: {}".format(
                i, num, ix, iy, false_num))

        if i == num:
            break
    pos = np.concatenate((res, np.array(pos)), axis=0)
    print(Counter(pos[:, 0]))

    pd.DataFrame(pos).to_csv("data/train_enhance.txt", index=None, header=None)

    pos[:, 2] = -1 * (pos[:, 2].astype(int))
    pd.DataFrame(pos).to_csv("data/train_enhance_view.txt",
                             index=None,
                             header=None)
Example 20
    def transform(self, df2: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        le = LabelEncoder()
        df2['mm'] = df2['make'] + ' ' + df2['model']
        g_mm_count = df2.groupby(['mm']).count().reset_index()
        # Keep only make+model combinations with more than 100 rows
        mm_more_than_100 = g_mm_count[g_mm_count['make'] > 100]['mm']
        df2 = df2[df2['mm'].isin(mm_more_than_100)]
        dfn3 = df2.copy()
        g1 = dfn3.groupby('mm')
        clf1 = IForest(contamination=0.01)
        flag = [1]

        if 1 in flag:

            dff1 = pd.DataFrame(columns=[
                'idv_id', 'kms_run', 'owners', 'age', 'Popularity Index',
                'quoted_price', 'outlier', 'dep_percentage'
            ])

            for idv_id, idv_id_df in g1:
                idv_id_df1 = idv_id_df[[
                    'kms_run', 'owners', 'age', 'quoted_price',
                    'dep_percentage'
                ]]
                clf1.fit(idv_id_df1)
                y_pred = clf1.predict(idv_id_df1)
                idv_id_df['outlier'] = y_pred.tolist()
                dff1 = pd.concat([dff1, idv_id_df])
            outlier_idv_if_dff1 = set(dff1[dff1['outlier'] == 1].index)

        df2 = df2.drop(outlier_idv_if_dff1)
        df = df2.copy()
        X = df[[
            'make', 'model', 'city', 'variant', 'owners', 'kms_run', 'age',
            'Popularity Index', 'ex_showroom_price', 'fuel_type',
            'transmission', 'color'
        ]]
        categorical_feature_mask = X.dtypes == object
        categorical_cols = X.columns[categorical_feature_mask].tolist()
        self.dic = {}
        for i in categorical_cols:
            X[i] = le.fit_transform(X[i])
            self.dic[i] = dict(zip(le.classes_, le.transform(le.classes_)))
        y = df[['dep_percentage']]
        aa = pd.concat([X, y], axis=1)

        return aa
Example 21
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
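A sketch of how such a dictionary is typically consumed: fit every detector on the same training data and compare test ROC AUC. The data helper and scoring loop are assumptions, not part of the original.

from pyod.utils.data import generate_data
from sklearn.metrics import roc_auc_score

X_train, y_train, X_test, y_test = generate_data(
    n_train=300, n_test=150, contamination=0.1, random_state=42)

classifiers = define_classifiers(random_state=42, outliers_fraction=0.1)
for name, clf in classifiers.items():
    clf.fit(X_train)
    scores = clf.decision_function(X_test)
    print('{:<45s} ROC AUC: {:.3f}'.format(name, roc_auc_score(y_test, scores)))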
Example 22
    def __load_classifiers(self):
        outliers_fraction = 0.05
        random_state = np.random.RandomState(0)

        classifiers = {
            'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
            'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
            'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
            'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
            'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
            'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
            'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
            'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
            'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        }

        return classifiers
Example 23
def train_model(X_train, contamination, model):
    """
    Train the model based on the user's choice (KNN, IForest, OCSVM).

    Parameters
    ----------
    X_train       : list of shape (n_train, n_features) containing the training set with only features.
    contamination : float representing the expected proportion of anomalies in the training set.
    model         : string, claiming the model to use for evaluating the confidence. It can be one of: KNN, IForest, OCSVM.

    Returns
    ----------
    clf           : obj with the model trained on the training set.
    """
    np.random.seed(331)
    n = np.shape(X_train)[0]
    if model == 'KNN':
        clf = KNN(n_neighbors=max(int(n * contamination), 1),
                  contamination=contamination).fit(X_train)
    elif model == 'IForest':
        clf = IForest(contamination=contamination, random_state=331).fit(X_train)
    elif model == 'OCSVM':
        clf = OCSVM(contamination=contamination).fit(X_train)

    return clf
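A brief usage sketch under the docstring's assumptions (KNN, IForest, and OCSVM come from pyod.models):

import numpy as np
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.ocsvm import OCSVM
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=331)

clf = train_model(X_train, contamination=0.1, model='IForest')
test_scores = clf.decision_function(X_test)  # anomaly scores on unseen data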
Example 24
def construct_raw_base_estimators():
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.cblof import CBLOF
    from pyod.models.hbos import HBOS
    from pyod.models.iforest import IForest
    from pyod.models.abod import ABOD
    from pyod.models.ocsvm import OCSVM

    estimator_list = []

    # predefined range of n_neighbors for KNN, AvgKNN, and LOF
    k_range = [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    for k in k_range:
        estimator_list.append(
            KNN(n_neighbors=k, method="largest", contamination=0.05))
        estimator_list.append(
            KNN(n_neighbors=k, method="mean", contamination=0.05))
        estimator_list.append(LOF(n_neighbors=k, contamination=0.05))

    # predefined range of nu for one-class svm
    nu_range = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for nu in nu_range:
        estimator_list.append(OCSVM(nu=nu, contamination=0.05))

    # predefined range for number of estimators in isolation forests
    n_range = [10, 20, 50, 70, 100, 150, 200, 250]
    for n in n_range:
        estimator_list.append(
            IForest(n_estimators=n, random_state=42, contamination=0.05))

    return estimator_list
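These heterogeneous base estimators are usually combined by fitting each one and collecting its scores, one column per model; a minimal sketch of that step (the combination strategy is an assumption, not part of the original):

import numpy as np
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=300, n_test=150, contamination=0.1, random_state=42)

estimators = construct_raw_base_estimators()
print('number of base estimators:', len(estimators))

# Fit each estimator and collect its test scores into one column.
all_scores = np.zeros((len(X_test), len(estimators)))
for i, est in enumerate(estimators):
    est.fit(X_train)
    all_scores[:, i] = est.decision_function(X_test)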
Example 25
def train_model(X, Y, contamination, name, from_scratch=True):
    model_dir = './model'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    file_name = name + '.pkl'

    if from_scratch:
        if name == 'ocsvm':
            model = OCSVM(contamination=contamination)
            model.fit(X)
        elif name == 'iforest':
            model = IForest(contamination=contamination)
            model.fit(X)
        elif name == 'lof':
            model = LOF(contamination=contamination)
            model.fit(X)
        elif name == 'knn':
            model = KNN(contamination=contamination)
            model.fit(X)
        elif name == 'xgbod':
            model = XGBOD(contamination=contamination)
            model.fit(X, Y)
        else:
            raise ValueError('unknown model name: {}'.format(name))

        save(model, model_dir, file_name)

    else:
        model = load(model_dir, file_name)

    return model
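save and load are helpers from the surrounding project and are not shown; a minimal joblib-based sketch of what they might look like (an assumption, not the original implementation):

import os

import joblib

def save(model, model_dir, file_name):
    # Persist a fitted detector to disk.
    joblib.dump(model, os.path.join(model_dir, file_name))

def load(model_dir, file_name):
    # Restore a previously saved detector.
    return joblib.load(os.path.join(model_dir, file_name))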
Example 26
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf on x_train

    # Labels and anomaly scores on the training data
    y_train_pred = clf.labels_  # binary labels (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # anomaly scores (larger is more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Use the trained clf to score unseen data
    y_test_pred = clf.predict(x_test)  # binary labels (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # anomaly scores (larger is more anomalous)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
Example 27
class IForestWrapper:
    def __init__(self, **kwargs):
        self._model = IForest(**kwargs)

    def fit(self, X, T):
        # Unsupervised learning; the targets T are not used
        self._model.fit(X)
        return self

    def predict(self, X):
        Y = self._model.predict(X)
        return Y

    def predict_proba(self, X):
        probs = self._model.predict_proba(X)
        return probs
Example 28
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define nine outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                behaviour="new"),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
Example 29
    def model_init(self, model):
        """Model initialisation of a single model.
        """
        if self.model == 'pca':
            self.models[model] = PCA(contamination=self.contamination)
        elif self.model == 'loda':
            self.models[model] = LODA(contamination=self.contamination)
        elif self.model == 'iforest':
            self.models[model] = IForest(n_estimators=50,
                                         bootstrap=True,
                                         behaviour='new',
                                         contamination=self.contamination)
        elif self.model == 'cblof':
            self.models[model] = CBLOF(n_clusters=3,
                                       contamination=self.contamination)
        elif self.model == 'feature_bagging':
            self.models[model] = FeatureBagging(
                base_estimator=PCA(contamination=self.contamination),
                contamination=self.contamination)
        elif self.model == 'copod':
            self.models[model] = COPOD(contamination=self.contamination)
        elif self.model == 'hbos':
            self.models[model] = HBOS(contamination=self.contamination)
        else:
            self.models[model] = HBOS(contamination=self.contamination)
        self.custom_model_scalers[model] = MinMaxScaler()
Example 30
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except (TypeError, IOError):
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # generate synthetic data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
        self.clf = SUOD(base_estimators=self.base_estimators)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.7
Example 32
class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'estimators_') and
                    self.clf.estimators_ is not None)
        assert_true(hasattr(self.clf, 'estimators_samples_') and
                    self.clf.estimators_samples_ is not None)
        assert_true(hasattr(self.clf, 'max_samples_') and
                    self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example 33
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train IForest detector
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)