Example #1
def dorc(preprocessedData, random_state, outliers_fraction=0.1):

    t0 = time.time()
    clf = IForest(contamination=outliers_fraction,
                  random_state=random_state,
                  n_jobs=-1)
    clf.fit(preprocessedData)
    scores = clf.decision_function(preprocessedData)

    # Apply IQR-based criteria to identify rare cells for further downstream analysis.
    q3 = np.percentile(scores, 75)
    iqr = stats.iqr(scores)
    th = q3 + (1.5 * iqr)

    # Select indexes that satisfy IQR-based thresholding criteria.
    indIqr = np.where(scores >= th)[0]
    print('shape of selected cells : {}'.format(indIqr.shape))

    # Create an array of binary predictions
    predictions = np.zeros(preprocessedData.shape[0])
    predictions[indIqr] = 1  # Replace predictions for rare cells with '1'.

    t1 = time.time()
    duration = round(t1 - t0, ndigits=4)
    print("Total running DoRC time is :" + str(duration) + " s")

    return predictions, scores, duration
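
A minimal call sketch for dorc (synthetic data; the imports below are the module-level names the function relies on):

import time
import numpy as np
from scipy import stats
from pyod.models.iforest import IForest

rng = np.random.RandomState(0)
preprocessedData = rng.randn(500, 20)  # hypothetical: 500 cells x 20 features
predictions, scores, duration = dorc(preprocessedData, random_state=42)
print(int(predictions.sum()), "cells flagged as rare")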
Example #2
def anomaly_detection(data, label):
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)

    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)

    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y,
        ifo.decision_scores_,
        bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y,
        ae.decision_scores_,
        bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # LOF is too slow to train on the full set; under-sampling is needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y,
        lof.decision_scores_,
        bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
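
utilities.plot_outlier_scores is a project helper not shown in this listing; a plausible minimal stand-in (hypothetical implementation, assuming it overlays per-class score densities using the bw and title arguments seen above):

import matplotlib.pyplot as plt
import seaborn as sns

def plot_outlier_scores(y_true, scores, bw=0.1, title=''):
    # Overlay kernel density estimates of the outlier scores per class.
    sns.kdeplot(scores[y_true == 0], bw_adjust=bw, label='inliers')
    sns.kdeplot(scores[y_true == 1], bw_adjust=bw, label='outliers')
    plt.title(title)
    plt.legend()
    plt.show()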
Example #3
class IForestSupervisedKNN(BaseDetector):
    def __init__(self, get_top=0.8, if_params=None, knn_params=None):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False

        # Avoid mutable default arguments for the detector configs.
        self.iforest = IForest(**(if_params or {}))
        self.knn = KNN(**(knn_params or {}))

    def fit(self, X, y=None):

        X = check_array(X)
        self._set_n_classes(y)

        self.iforest.fit(X)

        scores = self.iforest.predict_proba(X)[:, 1]

        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]

        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

        self.is_fitted = True

        return self

    def decision_function(self, X):

        check_is_fitted(self, ['is_fitted'])

        return self.knn.decision_function(X)
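
A short usage sketch for IForestSupervisedKNN (synthetic data; assumes the pyod BaseDetector/IForest/KNN and sklearn check_array/check_is_fitted imports the class depends on are in scope):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(300, 5)
det = IForestSupervisedKNN(get_top=0.8)
det.fit(X)
# KNN distances to the presumed-normal core serve as the final scores.
print(det.decision_scores_[:5])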
Example #4
def train():
    # start the training run
    try:
        # 1-. Load the data and configuration
        data = pd.read_csv(training_path, index_col=0)

        # Read in any configuration stored
        with open(param_path, 'r') as tc:
            hyper_parameters = json.load(tc)

        # 2-. Set up
        # instantiate the Isolation Forest model
        model = IForest(contamination=hyper_parameters['contamination'],
                        behaviour='new')
        model.fit(data)  # fit

        # 3-. Save the model
        model_name = 'great_model'
        with open(os.path.join(model_path, '{}.pkl'.format(model_name)),
                  'wb') as out:
            pickle.dump(model, out, protocol=0)

    # handle the case where training fails
    except Exception as e:
        # write the failure log
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during train: ' + str(e) + '\n' + trc)
        sys.exit(255)
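
train() relies on path constants defined elsewhere in the module; a plausible SageMaker-style layout (hypothetical values, not from the original source) would be:

import os

prefix = '/opt/ml/'
training_path = os.path.join(prefix, 'input/data/training/train.csv')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
model_path = os.path.join(prefix, 'model')
output_path = os.path.join(prefix, 'output')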
Example #5
 def densityBased(self):
     '''
     @brief Function that implements the density-based component
     @param self
     @return It returns the vector with the scores of the instances
     '''
     # Initialize the scores
     scores = np.array([0] * len(self.dataset)).astype(float)
     for i in range(self.num_iter):
         iforest = IForest(contamination=self.contamination,
                           behaviour="new")
         # Number in the interval [50, 1000]
         subsample_size = np.random.randint(50, 1001)
         sample = []
         if subsample_size >= len(self.dataset):
             sample = list(range(len(self.dataset)))
         else:
             # Take the sample and train the model
             sample = np.random.choice(len(self.dataset),
                                       size=subsample_size,
                                       replace=False)
         iforest.fit(self.dataset[sample])
         # Update the score to compute the mean
         scores[sample] += iforest.decision_scores_
     # Return the mean
     scores = scores / self.num_iter
     scores = scale(scores)
     return scores
Example #6
def iforest(X_train, X_test, Y_train, Y_test):
    from pyod.models.iforest import IForest
    model = IForest(random_state=0)
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
Example #7
def outlier_iforest(data, **kwargs):
    import pandas as pd
    from pyod.models.iforest import IForest
    contamination = float(kwargs.pop('contamination'))
    clf = IForest(contamination=contamination)
    clf.fit(data)
    pred = clf.labels_
    df = pd.DataFrame(pred, columns=['is_outlier'])
    ret = pd.concat([data, df], axis=1)
    return ret
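
Call sketch (hypothetical frame; note that contamination arrives as a string kwarg and is cast to float inside):

import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.randn(100, 3), columns=['a', 'b', 'c'])
flagged = outlier_iforest(data, contamination='0.05')
print(flagged['is_outlier'].sum(), 'rows flagged')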
Example #8
def remove_outliers(dft, target_col):

    ol_model = IForest()  # the detector choice could be exposed as a hyperparameter
    ol_model.fit(dft.drop(columns=target_col))
    dft['is_outlier'] = ol_model.labels_
    dft = dft[dft['is_outlier'] != 1]
    dft = dft.drop(columns='is_outlier')
    print("Completed Outlier Detection - ", datetime.datetime.now())

    return dft
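
Quick call sketch (synthetic frame; assumes pandas, numpy, datetime, and pyod's IForest are imported):

import datetime
import numpy as np
import pandas as pd
from pyod.models.iforest import IForest

df = pd.DataFrame(np.random.randn(200, 3), columns=['f1', 'f2', 'target'])
clean = remove_outliers(df, target_col='target')
print(len(df), '->', len(clean), 'rows after outlier removal')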
Example #9
    def S2(self):

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(), ['S2']] = 0  # would mark rows flagged anomalous in S1 as 0 in S2

        result['统计异常'] = water_data['S2'].values  # column name means 'statistical anomaly'

        # Find the anomalous dimension
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values  # column name means 'anomalous dimension (statistical)'

        # Save the scaler and model
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
Example #10
def outlier_detection(x_raw, y_raw):
    """
    Filter all ourlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all candidate outlier detection methods are listed below
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
        XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]  # avoid hard-coding the sample count
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
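
del_rowsorcolumns is a project helper not shown in this snippet; a minimal stand-in, assuming it simply wraps numpy.delete:

import numpy as np

def del_rowsorcolumns(arr, idx, axis=0):
    # Drop the given row (axis=0) or column (axis=1) indices from an ndarray.
    return np.delete(arr, idx, axis=axis)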
Example #11
def getOutlierIForest(dataset):
    '''
    @brief Function that runs the IForest algorithm on the dataset and obtains the
    labels of the dataset indicating whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # Initialize the model without verbosity
    ifor = IForest(verbose=0)
    # Fits the data and obtains labels
    ifor.fit(dataset)
    # Return labels
    return ifor.labels_
Example #12
class IForestPyOD(BaseAlgorithm):
    name = "iForest_pyod"

    def __init__(self, t=100, psi=256):

        self.iforest = IForest(max_samples=psi, n_estimators=t, behaviour="new", contamination=0.1)

    def fit(self, X):

        self.iforest.fit(X)

    def predict(self, X):

        return self.iforest.decision_function(X)
Example #13
 def detect(self, X, y=None):
     """
     :param X: Dataframe
     :param y: np.array
     :return: outlier scores
     """
     rng = np.random.RandomState(42)
     # Build the training sample
     n_estimators = 200  # number of trees in the forest
     outliers_fraction = 0.5  # assumed proportion of outliers (0.5 is the maximum pyod allows)
     clf = IForest(max_samples='auto', random_state=rng, contamination=outliers_fraction, n_estimators=n_estimators)
     clf.fit(X)
     scores = clf.decision_function(X)
     return scores
Example #14
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Returns a dataframe describing those outliers present in stocks based on the provided rules.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = { str(r):r for r in rules }
    rows = []
    stocks_by_sector_df = stocks_by_sector() # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df['asx_code']
    for stock in stocks:
        #print("Processing stock: ", stock)
        try:
           sector = stocks_by_sector_df.at[stock, 'sector_name']
           sector_companies = list(stocks_by_sector_df.loc[stocks_by_sector_df['sector_name'] == sector].asx_code)
           # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
           day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
           warning(None, "Unable to locate watchlist entry: {} - continuing without it".format(stock))
           continue
        state = {
            'day_low_high_df': day_low_high_df,  # never changes each day, so we init it here
            'all_stocks_change_in_percent_df': all_stocks_cip,
            'stock': stock,
            'daily_range_threshold': 0.20, # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            market_avg = all_stocks_cip[date].mean()
            sector_avg = all_stocks_cip[date].filter(items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({ 'market_avg': market_avg, 'sector_avg': sector_avg,
                           'stock_move': stock_move, 'date': date })
            for rule_name, rule in str_rules.items():
                points_by_rule[rule_name] += rule(state)
        d = { 'stock': stock }
        d.update(points_by_rule)
        rows.append(d)
    df = pd.DataFrame.from_records(rows)
    df = df.set_index('stock')
    print(df)
    from pyod.models.iforest import IForest
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    results = [row[0] for row, value in zip(df.iterrows(), scores) if value > 0]
    #print(results)
    print("Found {} outlier stocks".format(len(results)))
    return results
Example #15
def add_other_class(num, size, pad):
    res = pd.read_csv("data/train.txt", header=None).values
    tif_data = []
    for r in tqdm(range(res.shape[0])):
        img = get_cell(res[r][1], res[r][2], size)
        if img is None:
            print("img NOT Exist.", res[r])
            continue
        img = img.reshape(-1).tolist()
        tif_data.append([labels_key[res[r][0]]] + img)
    tif_data = np.array(tif_data)
    print(tif_data.shape)

    np.random.shuffle(tif_data)
    clf = IForest()
    clf.fit(tif_data[:, 1:])

    i = 0
    pos = []
    false_num = 0
    while True:
        ix = np.random.randint(pad, dataset.RasterXSize - pad)
        iy = np.random.randint(pad, dataset.RasterYSize - pad)
        t = get_cell(ix, iy, size)
        if t is None:
            continue
        t = t.reshape(1, -1)
        y_test_pred = clf.predict(t)[0]  # outlier labels (0 or 1)
        if y_test_pred == 1:
            i += 1
            pos.append(["其他"] + [ix, iy])  # "其他" = the "other" class label
            print("{}/{} added.".format(i, num))
        else:
            false_num += 1
            print("{}/{} is not include {}.{}. false_num: {}".format(
                i, num, ix, iy, false_num))

        if i == num:
            break
    pos = np.concatenate((res, np.array(pos)), axis=0)
    print(Counter(pos[:, 0]))

    pd.DataFrame(pos).to_csv("data/train_enhance.txt", index=None, header=None)

    pos[:, 2] = -1 * (pos[:, 2].astype(int))  # np.int is removed in modern NumPy
    pd.DataFrame(pos).to_csv("data/train_enhance_view.txt",
                             index=None,
                             header=None)
Example #16
    def transform(self, df2: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        le = LabelEncoder()
        df2['mm'] = df2['make'] + ' ' + df2['model']
        g_mm_count = df2.groupby(['mm']).count().reset_index()
        mm_more_than_100 = g_mm_count[g_mm_count['make'] > 100]['mm']
        df2 = df2[df2['mm'].isin(mm_more_than_100)]
        dfn3 = df2.copy()
        g1 = dfn3.groupby('mm')
        clf1 = IForest(contamination=0.01)
        flag = [1]

        if 1 in flag:

            dff1 = pd.DataFrame(columns=[
                'idv_id', 'kms_run', 'owners', 'age', 'Popularity Index',
                'quoted_price', 'outlier', 'dep_percentage'
            ])

            for idv_id, idv_id_df in g1:
                idv_id_df1 = idv_id_df[[
                    'kms_run', 'owners', 'age', 'quoted_price',
                    'dep_percentage'
                ]]
                clf1.fit(idv_id_df1)
                y_pred = clf1.predict(idv_id_df1)
                idv_id_df['outlier'] = y_pred.tolist()
                dff1 = pd.concat([dff1, idv_id_df])
            outlier_idv_if_dff1 = set(dff1[dff1['outlier'] == 1].index)

        df2 = df2.drop(outlier_idv_if_dff1)
        df = df2.copy()
        X = df[[
            'make', 'model', 'city', 'variant', 'owners', 'kms_run', 'age',
            'Popularity Index', 'ex_showroom_price', 'fuel_type',
            'transmission', 'color'
        ]]
        categorical_feature_mask = X.dtypes == object
        categorical_cols = X.columns[categorical_feature_mask].tolist()
        self.dic = {}
        for i in categorical_cols:
            X[i] = le.fit_transform(X[i])
            self.dic[i] = dict(zip(le.classes_, le.transform(le.classes_)))
        y = df[['dep_percentage']]
        aa = pd.concat([X, y], axis=1)

        return aa
Example #17
class IForestWrapper:
    def __init__(self, **kwargs):
        self._model = IForest(**kwargs)

    def fit(self, X, T):
        # unsupervised learning: targets are not used
        self._model.fit(X)
        return self

    def predict(self, X):
        Y = self._model.predict(X)
        return Y

    def predict_proba(self, X):
        probs = self._model.predict_proba(X)
        return probs
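
A usage sketch for the wrapper (synthetic data; note that pyod's predict_proba returns one column per class, which is why Examples #3 and #21 index [:, 1]):

import numpy as np

X = np.random.randn(200, 4)
model = IForestWrapper(contamination=0.05).fit(X, T=None)
labels = model.predict(X)        # 0 = inlier, 1 = outlier
probs = model.predict_proba(X)   # shape (n_samples, 2)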
Example #18
def main():
    dataset, label = pre_data()
    from numpy import nan as NA
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)
    x_train, x_test, y_train, y_label = train_test_split(dataset,
                                                         label,
                                                         test_size=0.3,
                                                         random_state=44)
    # x_train, x_test, y_train, y_label =[], [], [], []
    # for i in range(1000):
    #     x_train.append(dataset[i])
    #     y_train.append(label[i])
    # for i in range(6000,10000):
    #     x_train.append(dataset[i])
    #     y_train.append(label[i])
    # x_test = dataset[1000:6000]
    # y_label = label[1000:6000]
    for i in range(3):
        clf_name = 'IForest'
        clf = IForest()
        clf.fit(x_train)

        # get the prediction label and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        from sklearn.metrics import accuracy_score
        from sklearn.metrics import precision_score
        from sklearn.metrics import recall_score
        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))
        # get the prediction on the test data
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print(accuracy_score(y_label, y_test_pred))
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_label, y_test_scores)
Example #19
def test_pyod_isolation_forest():
    import shap
    import numpy as np
    from pyod.models.iforest import IForest
    from sklearn.ensemble.iforest import _average_path_length  # moved to sklearn.ensemble._iforest in newer releases

    X, _ = shap.datasets.boston()
    for max_features in [1.0, 0.75]:
        iso = IForest(max_features=max_features)
        iso.fit(X)

        explainer = shap.TreeExplainer(iso)
        shap_values = explainer.shap_values(X)

        score_from_shap = -2**(
            -(np.sum(shap_values, axis=1) + explainer.expected_value) /
            _average_path_length(np.array([iso.max_samples_]))[0])
        assert np.allclose(iso.detector_.score_samples(X),
                           score_from_shap,
                           atol=1e-7)
Example #20
    def get_IF_scores(dataframe,
                      cols,
                      outliers_fraction=0.01,
                      standardize=True):
        '''Takes a df and a list of selected column names; outliers_fraction defaults to 0.01.

        Returns:
            df with Isolation Forest (IF) scores added
        '''
        if standardize:
            #standardize selected variables
            minmax = MinMaxScaler(feature_range=(0, 1))
            dataframe[cols] = minmax.fit_transform(dataframe[cols])

        #Convert dataframe to a numpy array in order to incorporate our algorithm
        arrays = []
        for row in cols:
            row = dataframe[row].values.reshape(-1, 1)
            arrays.append(row)
        X = np.concatenate((arrays), axis=1)

        #fit
        clf = IForest(contamination=outliers_fraction, random_state=0)
        clf.fit(X)

        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        CheckOutliers.df3 = dataframe
        CheckOutliers.df3['outlier'] = y_pred.tolist()

        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
              'found with IForest')
Example #21
# Created on Tue Dec 24 15:54:36 2019
# @author: zixing.mei

from pyod.models.iforest import IForest
clf = IForest(behaviour='new',
              bootstrap=False,
              contamination=0.1,
              max_features=1.0,
              max_samples='auto',
              n_estimators=500,
              n_jobs=-1,
              random_state=None,
              verbose=0)
clf.fit(x)
out_pred = clf.predict_proba(x, method='linear')[:, 1]
train['out_pred'] = out_pred
train['for_pred'] = np.where(train.out_pred > 0.7, '负样本占比', '正样本占比')  # labels: share of negative / positive samples
dic = dict(train.groupby(train.for_pred).bad_ind.agg(np.sum)/ \
           train.bad_ind.groupby(train.for_pred).count())
pd.DataFrame(dic, index=[0])

clf = IForest(behaviour='new',
              bootstrap=False,
              contamination=0.1,
              max_features=1.0,
              max_samples='auto',
              n_estimators=500,
              n_jobs=-1,
              random_state=None,
              verbose=0)
Example #22
class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'estimators_') and
                    self.clf.estimators_ is not None)
        assert_true(hasattr(self.clf, 'estimators_samples_') and
                    self.clf.estimators_samples_ is not None)
        assert_true(hasattr(self.clf, 'max_samples_') and
                    self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #23
#Extracting y-labels for the validation data and dropping them from the X data. Y labels will be the same for all feature sets, of course
Y_valid1 = X_valid1['Label_<lambda>']
X_valid1.drop(['Label_<lambda>'], inplace=True, axis=1)

# Reading original test data to extract the malicious flow data after prediction
orig_test_data = pd.read_csv("test_data.csv", header=None)
orig_test_data.columns = ['Date_Flow_Start', 'Duration','Protocol','Src_IP','Src_Port','Direction','Dst_IP','Dst_Port','State','Source_Service','Dest_Service','Total_Packets','BiDirection_Bytes','SrcToDst_Bytes']

"""TRAINING on Feature Set 1

IFOREST on Default Parameters
"""

#Training
clf1 = IForest(random_state = 42) # Default contamination = 0.1
clf1.fit(X_train1)

#Setting threshold using the contamination parameter
dec_scores = clf1.decision_scores_
dec_scores_sorted=sorted(dec_scores, reverse=True)
a = round(len(X_train1) * clf1.contamination)
print(a)

anomalies=dec_scores_sorted[:a]
threshold = anomalies[-1]
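
pyod also stores the cutoff implied by contamination on the fitted detector; the manual computation above should agree with it up to interpolation at tied scores:

print(clf1.threshold_)  # decision-score threshold derived from the contamination rate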

# Validation data is scored
y_valid_scores = clf1.decision_function(X_valid1)
y_valid_scores = pd.Series(y_valid_scores)

valid_SrcIP = np.load('preprocessing1_valid_srcIP.npy',allow_pickle=True)
Example #24
class IF(IForest):
    def __init__(self,
                 n_estimators=100,
                 max_samples='auto',
                 contamination=0.1,
                 random_state=42,
                 verbose=1):
        """Isolation Forest (IF)

        Parameters
        ----------
        n_estimators : int, optional (default=100)
            The number of base estimators in the ensemble.

        max_samples : int or float, optional (default="auto")
            The number of samples to draw from X to train each base estimator.

        contamination : float in (0., 0.5), optional (default=0.1)
            The amount of contamination of the data set, i.e., the proportion of outliers in the data set.
            Used when fitting to define the threshold on the decision function.

        verbose: int (default is 1)
            A print level is to control what information should be printed according to the given value.
            The higher the value is, the more info is printed.

        random_state: int (default is 42)


        """
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.contamination = contamination
        self.verbose = verbose
        self.random_state = random_state

    def fit(self, X_train, y_train=None):
        """Fit the model. y is ignored in unsupervised methods.

        Parameters
        ----------
        X_train : numpy array of shape (n_samples, n_features)
            The input samples.

        y_train : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        self.model_ = IForest(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            max_features=1.,
            bootstrap=False,
            n_jobs=-1,
            behaviour='deprecated',  # no longer used as of sklearn 0.24
            random_state=self.random_state,
            verbose=self.verbose)

        self.model_.fit(X=X_train)

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        return self.model_.decision_function(X)

    def predict_proba(self, X):
        raise NotImplementedError
Example #25
def evaluation_od_train(x,
                        y,
                        data_name,
                        model_name="iforest",
                        chosen_subspace=None):
    """
    using anomaly detector to yield anomaly score for each subspace,
    generate two files: the subspaces with the highest anomaly score & lof score for each subspace
    :param x: data matrix
    :param y: class information
    :param data_name: the data set name, using for naming the ground truth file
    :param model_name: anomaly detector name, default: lof
    :param chosen_subspace: use this to only evaluate a subset of the power set of full feature space
    :return: df: a ground-truth map using anomaly idx as key and ground truth feature subspace as value.
    """
    global chosen_model

    dim = x.shape[1]
    ano_idx = np.where(y == 1)[0]
    n_ano = len(ano_idx)

    # get all the possible feature subset or just use given subset list
    f_subsets = utils.get_subset_candidate(dim, chosen_subspace)

    # score anomalies in each subspace, generate the score matrix
    n_subsets = len(f_subsets)
    score_matrix = np.zeros([n_ano, n_subsets])
    for i in tqdm(range(n_subsets)):
        subset = f_subsets[i]
        x_subset = x[:, subset]

        if model_name == "iforest":
            clf = IForest()
            clf.fit(x_subset)
            od_score = clf.decision_scores_
        elif model_name == "copod":
            clf = COPOD()
            clf.fit(x_subset)
            od_score = clf.decision_scores_
        elif model_name == "hbos":
            clf = HBOS()
            clf.fit(x_subset)
            od_score = clf.decision_scores_
        else:
            raise ValueError("unsupported od model")

        od_score = utils.min_max_norm(od_score)
        score_matrix[:, i] = od_score[ano_idx]

    if not os.path.exists(eva_root + "data_od_evaluation/"):
        os.makedirs(eva_root + "data_od_evaluation/")

    # score matrix to df
    anomaly_score_df = pd.DataFrame(data=score_matrix,
                                    columns=[str(s) for s in f_subsets])
    col_name = anomaly_score_df.columns.tolist()
    col_name.insert(0, 'ano_idx')
    anomaly_score_df["ano_idx"] = ano_idx
    anomaly_score_df = anomaly_score_df.reindex(columns=col_name)
    path1 = eva_root + "data_od_evaluation/" + data_name + "_score_" + model_name + ".csv"
    anomaly_score_df.to_csv(path1, index=False)

    # get the ground truth (one subspace for each anomaly that the anomaly can obtain the highest anomaly score)
    g_truth_df = pd.DataFrame(columns=["ano_idx", "exp_subspace"])

    exp_subspaces = []
    for ii, ano_score in enumerate(score_matrix):
        max_score_idx = int(np.argmax(ano_score))
        exp_subset = str(f_subsets[max_score_idx])
        exp_subspaces.append(exp_subset)
    g_truth_df["ano_idx"] = ano_idx
    g_truth_df["exp_subspace"] = exp_subspaces

    g_truth_df = g_truth_df.astype({"exp_subspace": "object"})  # astype returns a copy; assign it
    path2 = eva_root + "data_od_evaluation/" + data_name + "_gt_" + model_name + ".csv"
    g_truth_df.to_csv(path2, index=False)
    return anomaly_score_df, g_truth_df
Example #26
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train IForest detector
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example #27
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Returns a dataframe describing those outliers present in stocks based on the provided rules.
    All_stocks_cip is the "change in percent" for at least the stocks present in the specified list
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}
    rows = []
    stocks_by_sector_df = (stocks_by_sector()
                           )  # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df["asx_code"]
    for stock in stocks:
        # print("Processing stock: ", stock)
        try:
            sector = stocks_by_sector_df.at[stock, "sector_name"]
            sector_companies = list(stocks_by_sector_df.loc[
                stocks_by_sector_df["sector_name"] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(
                None,
                "Unable to locate watchlist entry: {} - continuing without it".
                format(stock),
            )
            continue
        state = {
            "day_low_high_df":
            day_low_high_df,  # never changes each day, so we init it here
            "all_stocks_change_in_percent_df": all_stocks_cip,
            "stock": stock,
            "daily_range_threshold":
            0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            market_avg = all_stocks_cip[date].mean()
            sector_avg = all_stocks_cip[date].filter(
                items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({
                "market_avg": market_avg,
                "sector_avg": sector_avg,
                "stock_move": stock_move,
                "date": date,
            })
            for rule_name, rule in str_rules.items():
                try:
                    points_by_rule[rule_name] += rule(state)
                except TypeError:  # handle nan's in dataset safely
                    pass
        d = {"stock": stock}
        d.update(points_by_rule)
        rows.append(d)
    df = pd.DataFrame.from_records(rows)
    df = df.set_index("stock")
    # print(df)
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    results = [
        row[0] for row, value in zip(df.iterrows(), scores) if value > 0
    ]
    # print(results)
    print("Found {} outlier stocks".format(len(results)))
    return results
Example #28
def anomaly_rate(model, validation_df, freq, plot=False):
    if freq[:-1].isnumeric() and (freq[-1] == 'S' or freq[-1] == 'D'):
        last_history = (model.start + model.t_scale).round(freq)
    else:
        raise ValueError(
            "Unsupported frequency format. "
            "Provide any valid frequency for pd.date_range, as multiple of 'D' or 'S'."
        )

    first_validation = validation_df['ds'].iloc[0]
    last_validation = validation_df['ds'].iloc[-1]

    if last_validation > last_history:
        if first_validation <= last_history:
            validation_df = validation_df.loc[
                validation_df['ds'] > last_history].dropna()[['ds', 'y']]

        start_timer = time.time()
        future = validation_df['ds'].to_frame(name='ds')
        prediction_data = model.predict(future)[['ds', 'yhat']]  # TOO SLOW!
        print("--- Prediction: %s seconds ---" % (time.time() - start_timer))

        df = pd.DataFrame({
            'y': validation_df['y'].values,
            'yhat': prediction_data['yhat'].values
        })
        scaler = MinMaxScaler(feature_range=(0, 1))
        df[['y', 'yhat']] = scaler.fit_transform(df[['y', 'yhat']])

        clf_name = 'iForest'
        clf = IForest()
        clf.fit(df)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        if plot:
            # fig = plt.figure(facecolor='w', figsize=(10, 6))
            # ax = fig.add_subplot(111)
            # ax.plot(prediction_data['ds'].dt.to_pydatetime(), deviation, 'k.')
            # ax.plot(prediction_data['ds'][y_train_pred == 1].dt.to_pydatetime(), deviation[y_train_pred == 1], 'r.')
            # fig.show()

            fig1 = plt.figure(facecolor='w', figsize=(10, 6))
            ax = fig1.add_subplot(111)
            ax.plot(prediction_data['ds'].dt.to_pydatetime(), y_train_scores)
            ax.plot(
                prediction_data['ds'][y_train_pred == 1].dt.to_pydatetime(),
                y_train_scores[y_train_pred == 1], 'r.')
            fig1.show()

            fig2 = plt.figure(facecolor='w', figsize=(10, 6))
            ax = fig2.add_subplot(111)
            ax.plot(validation_df['ds'].dt.to_pydatetime(),
                    validation_df['y'].values)
            ax.plot(prediction_data['ds'].dt.to_pydatetime(),
                    prediction_data['yhat'].values)
            ax.vlines(
                prediction_data['ds'][y_train_pred == 1].dt.to_pydatetime(),
                min(validation_df['y'].values), max(validation_df['y'].values),
                'r')
            fig2.show()

        return sum(y_train_pred) / len(y_train_pred)

    else:
        raise ValueError(
            "Validation dataset has no data point after the last member of the time series "
            "of historical data that the model was trained on. Please use a validation "
            "dataset whose time series extends past %s." % last_history)
Example #29
# In[19]:

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)

# Reduce the de-duplicated data to two dimensions
X14 = tsne.fit_transform(unique4)
plt.figure(figsize=(20, 20))
plt.scatter(X14[:, 0], X14[:, 1], c=pca4.labels_)
plt.show()

# In[17]:

from pyod.models.iforest import IForest
iforest2 = IForest()
iforest2.fit(unique2)

# In[15]:

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)

# Reduce the de-duplicated data to two dimensions
X22 = tsne.fit_transform(unique2)
plt.figure(figsize=(20, 20))
plt.scatter(X22[:, 0], X22[:, 1], c=iforest2.labels_)
plt.show()

# In[22]:

iforest3 = IForest()
Example #30
class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'estimators_') and
                self.clf.estimators_ is not None)
        assert (hasattr(self.clf, 'estimators_samples_') and
                self.clf.estimators_samples_ is not None)
        assert (hasattr(self.clf, 'max_samples_') and
                self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example #31
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.data import visualize

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, contamination=contamination)

    # train IForest detector
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # get the prediction label and decision_scores_ on the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example #32
class Remove_Outliers(BaseEstimator, TransformerMixin):
    def __init__(self,
                 target,
                 contamination=.20,
                 random_state=42,
                 methods=['knn', 'iso', 'mcd']):

        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        # Nothing to fit eagerly; the removal happens in fit_transform.
        return self

    def transform(self, data, y=None):
        return data

    def fit_transform(self, dataset, y=None):
        data = dataset.copy()

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(data.drop(self.target, axis=1))
            iso_predict = self.iso_forest.predict(
                data.drop(self.target, axis=1))
            data['iso'] = iso_predict

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(data.drop(self.target, axis=1))
            knn_predict = self.knn_out.predict(data.drop(self.target, axis=1))
            data['knn'] = knn_predict

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(data.drop(self.target, axis=1))
            pca_predict = self.out_pca.predict(data.drop(self.target, axis=1))
            data['pca'] = pca_predict

        # use for features that are Gaussian-distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(data.drop(self.target, axis=1))
            mcd_predict = self.mcd.predict(data.drop(self.target, axis=1))
            data['mcd'] = mcd_predict

        data['vote_outlier'] = 0

        for i in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[i]

        self.outliers = data[data['vote_outlier'] == len(self.methods)]

        return dataset[[
            True if i not in self.outliers.index else False
            for i in dataset.index
        ]]
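
A usage sketch for the voting remover (synthetic frame; assumes the pyod/sklearn detector imports the class relies on, and a pyod version that still accepts behaviour='new'):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(300, 4), columns=['f1', 'f2', 'f3', 'y'])
remover = Remove_Outliers(target='y', contamination=0.1, methods=['knn', 'iso'])
kept = remover.fit_transform(df)
print(len(df) - len(kept), 'rows dropped by unanimous vote')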