def test_iforest_subsampled_features():
    # Non-regression test for issue #5732, which failed at predict time.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
Example #4
    def _predict_self(self):

        clf = IsolationForest(contamination=self.frac)

        clf.fit(self.num_X)

        return clf.predict(self.num_X)
def outlier_rejection(X, y):
    """This will be our function used to resample our dataset."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
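A minimal usage sketch for the resampler above, assuming imbalanced-learn is installed and that ``rng`` is defined at module level as in the original example; the FunctionSampler wrapper and the toy data are illustrative only:

import numpy as np
from imblearn import FunctionSampler

rng = np.random.RandomState(42)  # assumed module-level RNG used by outlier_rejection
X = rng.randn(200, 2)
y = rng.randint(0, 2, size=200)

# FunctionSampler applies outlier_rejection during fit_resample
sampler = FunctionSampler(func=outlier_rejection)
X_res, y_res = sampler.fit_resample(X, y)
print(X_res.shape, y_res.shape)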
Example #6
def test_iforest_parallel_regression():
    """Check parallel regression."""
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
Example #7
def IsolationForest_calulate(train_data_one, test_data):
    # use an anomaly detection method
    clf = IsolationForest()
    # train the anomaly detection model
    clf.fit(train_data_one)
    # model prediction
    Pre_result = clf.predict(test_data)
    # compute the fraction of samples predicted as normal
    prob = len([x for x in Pre_result if x == 1]) / len(Pre_result)
    return prob
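A hedged usage sketch for the function above; the synthetic arrays are assumptions for illustration:

import numpy as np

rng = np.random.RandomState(0)
train_data_one = rng.normal(size=(200, 3))           # presumed "normal" training data
test_data = np.vstack([rng.normal(size=(45, 3)),     # mostly normal test points
                       rng.uniform(-6, 6, (5, 3))])  # plus a few likely outliers
print(IsolationForest_calulate(train_data_one, test_data))  # fraction predicted normal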
Example #8
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    pred = clf.predict(X)

    # assert that outliers are detected (predict here returns anomaly
    # scores in this early API; the higher, the more abnormal):
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
def test_iforest_works(contamination):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)
    # assert that outliers are detected:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_array_equal(pred, 6 * [1] + 2 * [-1])
    def isolationForest(self, settings, mname, data):
        '''
        :param settings: -> settings dictionary
        :param mname: -> name of serialized cluster
        :return: -> isolation forest instance
        :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
        '''
        # rng = np.random.RandomState(42)
        if settings['random_state'] == 'None':
            settings['random_state'] = None

        if isinstance(settings['bootstrap'], str):
            settings['bootstrap'] = str2Bool(settings['bootstrap'])

        if isinstance(settings['verbose'], str):
            settings['verbose'] = str2Bool(settings['verbose'])

        if settings['max_samples'] != 'auto':
            settings['max_samples'] = int(settings['max_samples'])
        # print type(settings['max_samples'])
        for k, v in settings.items():
            logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
            print("IsolationForest %s set to %s" % (k, v))
        try:
            clf = IsolationForest(n_estimators=int(settings['n_estimators']),
                                  max_samples=settings['max_samples'],
                                  contamination=float(settings['contamination']),
                                  bootstrap=settings['bootstrap'],
                                  max_features=float(settings['max_features']),
                                  n_jobs=int(settings['n_jobs']),
                                  random_state=settings['random_state'],
                                  verbose=settings['verbose'])
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args))
            sys.exit(1)
        # clf = IsolationForest(max_samples=100, random_state=rng)
        # print "*&*&*&& %s" % type(data)
        try:
            clf.fit(data)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(1)
        predict = clf.predict(data)
        print "Anomaly Array:"
        print predict
        self.__serializemodel(clf, 'isoforest', mname)
        return clf
 def check(self, timeseries_data, check_value):
     data = timeseries_data.values.tolist()
     check_value_list = []
     check_value_list.append([check_value])
     # Isolation Forest anomaly check; ``data`` is a plain Python list
     x_train = []
     for i in range(len(data)):
         x_train.append([data[i]])
     clf = IsolationForest(behaviour='new', max_samples=100)
     clf.fit(x_train)
     scores_pred = clf.predict(check_value_list)
     if scores_pred[-1] < 0:
         if check_value > data[-1]:
             return "uprush", scores_pred[-1]
         else:
             return "anticlimax", scores_pred[-1]
     else:
         return "no alarm", 0
Example #12
    def predict(self, X, window=DEFAULT_WINDOW):
        """
        Predict if a particular sample is an outlier or not.

        :param X: the time series to detect of
        :param type X: pandas.Series
        :param window: the length of window
        :param type window: int
        :return: 1 denotes normal, 0 denotes abnormal.
        """
        x_train = list(range(0, 2 * window + 1)) + list(range(0, 2 * window + 1)) + list(range(0, window + 1))
        sample_features = list(zip(x_train, X))  # materialize: fit() needs a sequence, not an iterator
        clf = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples,
                              contamination=self.contamination, max_features=self.max_feature,
                              bootstrap=self.bootstrap, n_jobs=self.n_jobs,
                              random_state=self.random_state, verbose=self.verbose)
        clf.fit(sample_features)
        predict_res = clf.predict(sample_features)
        if predict_res[-1] == -1:
            return 0
        return 1
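The window-based feature construction used by predict can be sketched standalone; the series, window size, and default IsolationForest parameters below are assumptions, not the detector's actual configuration:

import numpy as np
from sklearn.ensemble import IsolationForest

window = 14
# a synthetic series of length 5 * window + 3, matching the x_train layout above
series = np.sin(np.linspace(0, 20, 5 * window + 3))
x_train = list(range(0, 2 * window + 1)) + list(range(0, 2 * window + 1)) + list(range(0, window + 1))
sample_features = list(zip(x_train, series))
clf = IsolationForest().fit(sample_features)
print(clf.predict(sample_features)[-1])  # -1 would flag the latest point as abnormal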
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        method = None
    do_outlier_removal.fit(np.array(df[col]))
    if method == 'Isolation Forest':
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
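A usage sketch for outlier_removal, assuming the two-level ('meta', ...) column convention the function writes to; the frame and parameters are made up:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({('data', 'x'): rng.normal(size=100),
                   ('data', 'y'): rng.normal(size=100)})
cols = [('data', 'x'), ('data', 'y')]
df, model = outlier_removal(df, cols, 'Isolation Forest', {'contamination': 0.05})
print(df[('meta', "Outliers - Isolation Forest{'contamination': 0.05}")].value_counts())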
Example #14
def single_user_analytics(user_df):
    """
    analyses the data of a user using the IsolationForest algorithm and returns outliers.
    :param user_df: dataframe should only have the columns: Method, Path, Id, Resource, Status, Src
    :type user_df: dataframe
    :return:
    :rtype: Series
    """
    print('analyzing')
    converted_user_df = user_df.drop("Date", axis=1)
    converted_user_df = pd.get_dummies(converted_user_df, columns=["Method", "Path", "Id", "Resource", "Status", "Src"])
    # note: recent scikit-learn releases require contamination in (0, 0.5] or 'auto'
    clf = IsolationForest(contamination=0, max_features=len(converted_user_df.columns.values))
    clf.fit(converted_user_df)
    pred = clf.predict(converted_user_df)
    temp_df = user_df
    temp_df['e'] = pd.Series(pred, user_df.index)
    novelty = user_df.loc[temp_df["e"] == -1]
    return novelty.index.values
Example #15
def apply_ISOForest(df, contamination=0.1, columns_to_use=[]):
    '''
    input -- pandas data frame with columns [No. Time Source Destination Protocol Length Info]

    output -- numpy array with {0, 1} where 1 is anomaly
    '''
    iso_frame = df.copy()

    iso_frame.drop(columns=['No.', 'Source', 'Destination', 'Info'],
                   inplace=True)
    iso_frame = pd.get_dummies(iso_frame, columns=['Protocol'])

    iso_frame['Time'] = list(map(int, iso_frame['Time'].values))

    names = []
    if type(columns_to_use) == str:
        names = ['Length', 'Protocol_' + columns_to_use]
    if type(columns_to_use) == list:
        if len(columns_to_use) == 0:
            names = list(iso_frame.columns)
            names.remove('Time')
        else:
            names2 = []
            for name in list(iso_frame.columns):
                parts = name.split('_')
                if len(parts) < 2:
                    continue
                if parts[1] in columns_to_use:
                    names2.append(name)
            names = names2
            names.append('Length')

    iso_frame2 = pd.DataFrame(columns=names)

    grouped = iso_frame.groupby('Time')

    for col in names:
        iso_frame2[col] = grouped[col].agg(np.sum)

    iso = IsolationForest(n_jobs=-1, n_estimators=20)
    iso.fit(iso_frame2)
    pred_out = iso.predict(iso_frame2)
    pred_out[pred_out == 1] = 0
    pred_out[pred_out == -1] = 1
    return pred_out
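An illustrative call, assuming a frame shaped like the Wireshark CSV export named in the docstring; all values are made up:

import pandas as pd

df = pd.DataFrame({'No.': range(1, 7),
                   'Time': [0.1, 0.4, 0.9, 1.2, 1.8, 2.3],
                   'Source': ['10.0.0.1'] * 6,
                   'Destination': ['10.0.0.2'] * 6,
                   'Protocol': ['TCP', 'TCP', 'UDP', 'TCP', 'UDP', 'TCP'],
                   'Length': [60, 64, 1500, 60, 1400, 62],
                   'Info': ['-'] * 6})
print(apply_ISOForest(df))  # one flag per aggregated time bucket; 1 marks an anomaly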
Example #16
def main():

    # Read all the csv files
    csvPath = "./csv_files"
    csvFiles = [f for f in listdir(csvPath) if isfile(join(csvPath, f))]

    dfs = []
    for cv in csvFiles:
        print("CSV Processing: " + cv)
        dfs.append(pd.read_csv(csvPath + '/' + cv, index_col=False))

    df = pd.concat(dfs, ignore_index=True)
    #df = df.drop('Unnamed: 0', axis=1)

    # Process all the csv file
    totalNormal = 0
    totalAnomalies = 0

    # Turn every column to numeric
    cols = [c for c in df.columns]

    nom_cols = ['ip_flags', 'tcp_udp_flags']
    for c in nom_cols:
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])

    # Use the isolation forest to find the anomalies -1: anomaly 1:normal
    clf = IsolationForest(n_estimators=10,
                          max_samples=int(0.2 * len(df['time_diff'])) + 1,
                          contamination='auto',
                          behaviour='new')
    clf.fit(df)

    df['label'] = clf.predict(df)

    totalNormal = len(df[df['label'] == 1])
    totalAnomalies = len(df[df['label'] == -1])
    print("Normal: " + str(totalNormal))
    print("Anomaly: " + str(totalAnomalies))
    df.to_csv('./processed_csv/' + 'processed_' + cv, index=False)

    #Save the model
    filename = 'model.sav'
    pickle.dump(clf, open(filename, 'wb'))
Example #17
def visualize():
    cluster_dataframe = pd.read_csv("data/cluster.csv")
    X = cluster_dataframe.drop(['connections'], axis=1)
    clusters = 4
    X = normalize(X)
    reduced_data = PCA(n_components=clusters).fit_transform(X)

    #Outlier Test
    model = IsolationForest(contamination=0.05)
    model.fit(reduced_data)
    outliers = model.predict(reduced_data)
    outlier_frame = pd.DataFrame()
    outlier_frame['connections'] = cluster_dataframe['connections']
    outlier_frame['X'] = reduced_data[:, 0]
    outlier_frame['Y'] = reduced_data[:, 1]
    outlier_frame['isOutlier'] = outliers
    normal_connection = outlier_frame.loc[outlier_frame.isOutlier == 1]
    anomalous_connection = outlier_frame.loc[outlier_frame.isOutlier == -1]

    data = [go.Scatter(
        x = normal_connection['X'],
        y = normal_connection['Y'],
        text = normal_connection['connections'],
        hoverinfo = 'text',
        name="Normal Connections",
        mode = 'markers',
        marker=dict(
            color = 'rgb(34,140,217)'
        )
    ), go.Scatter(
        x = anomalous_connection['X'],
        y = anomalous_connection['Y'],
        text = anomalous_connection['connections'],
        hoverinfo = 'text',
        name = "Anomalous Connections",
        mode = 'markers',
        marker=dict(
            color = 'rgb(235,82,82)'
        )
    ) 
    ]

    graphJSON = json.dumps(data, cls=plotly.utils.PlotlyJSONEncoder)
    return render_template('visualize.html', graphJSON=graphJSON)
Example #18
def anomaly_test(train_data, train_label, test_data, test_label):
    train_total = pd.concat([train_data, train_label], axis=1)
    normal_index = train_total[train_total['Label'] == 1].index
    train_total = train_total.loc[normal_index]
    test_label = labeling(test_label)

    train_total = train_total.drop(['Label'], axis=1)
    train_total = train_total.sample(frac=0.2, random_state=42)

    print("\nOne-class SVM")
    ocs = OneClassSVM(kernel="linear", gamma='auto')
    ocs.fit(train_total)
    ocs_pred = ocs.predict(test_data)
    df_ocs = pd.DataFrame(data=(test_label['Label']).to_numpy(),
                          columns=['actual'])
    df_ocs['predict'] = ocs_pred
    ocs_result = scoring(df_ocs)
    print(ocs_result)

    print("\nIsolation Forest")
    iforset = IsolationForest(max_samples=100,
                              contamination=0.1,
                              random_state=42)
    iforset.fit(train_total)
    iforse_pred = iforset.predict(test_data)
    df_iforse = pd.DataFrame(data=(test_label['Label']).to_numpy(),
                             columns=['actual'])
    df_iforse['predict'] = iforse_pred
    iforse_result = scoring(df_iforse)
    print(iforse_result)

    print("\nLocal Outlier Factor")
    lof = LocalOutlierFactor(n_neighbors=15)
    lof.fit(train_total)
    lof_pred = lof.fit_predict(test_data)
    df_lof = pd.DataFrame(data=(test_label['Label']).to_numpy(),
                          columns=['actual'])
    df_lof['predict'] = lof_pred
    lof_result = scoring(df_lof)
    print(lof_result)
Example #19
def Preprocessing_train():
    global col_train
    global col_train_bis
    global train
    global clf
    global prepro_y
    global prepro
    global mat_new
    global test
    global train_dataset
    # print (test)
    clf = IsolationForest(random_state=42)
    clf.fit(train)
    y_noano = clf.predict(train)
    y_noano = pd.DataFrame(y_noano, columns=['Top'])
    y_noano[y_noano['Top'] == 1].index.values

    # train = train.iloc[y_noano[y_noano['Top'] == 1].index.values]
    train.reset_index(drop=True, inplace=True)
    # print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
    # print("Number of rows without outliers:", train.shape[0])
    col_train = list(train.columns)
    col_train_bis = list(train.columns)
    col_train_bis.remove('DEM')
    mat_train = np.matrix(train)
    mat_test = np.matrix(test)
    mat_new = np.matrix(train.drop('DEM', axis=1))
    mat_y = np.array(train.DEM).reshape((len(train.DEM), 1))
    # preprocessing
    prepro_y = MinMaxScaler()
    prepro_y.fit(mat_y)
    prepro = MinMaxScaler()
    prepro.fit(mat_train)

    prepro_test = MinMaxScaler()
    prepro_test.fit(mat_new)
    # print (mat_test)
    train_dataset = pd.DataFrame(prepro.transform(mat_train), columns=col_train)

    test = pd.DataFrame(prepro_test.transform(mat_test), columns=col_train_bis)
    print(train_dataset)
    # print(test)

    return "Preprocessing"
Example #20
def outlier_prediction(x_train, y_train):
    # Use built-in isolation forest or use predicted vs. actual
    # Compute squared residuals of every point
    # Make a threshold criteria for inclusion

    # The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
    rng = np.random.RandomState(42)
    clf_all_features = IsolationForest(max_samples=100, random_state=rng)
    clf_all_features.fit(x_train)

    # Predict if a particular sample is an outlier using all features for higher dimensional data set.
    y_pred_train = clf_all_features.predict(x_train)

    # Exclude suggested outlier samples for improvement of prediction power/score
    outlier_map_out_train = (y_pred_train == 1)
    x_train_modified = x_train[outlier_map_out_train, ]
    y_train_modified = y_train[outlier_map_out_train, ]

    return x_train_modified, y_train_modified
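A quick usage sketch with synthetic arrays (assumptions for illustration):

import numpy as np

rng = np.random.RandomState(7)
x_train = np.vstack([rng.normal(size=(95, 4)),
                     rng.uniform(-8, 8, (5, 4))])  # inject a few outliers
y_train = rng.normal(size=100)
x_in, y_in = outlier_prediction(x_train, y_train)
print(x_train.shape, '->', x_in.shape)  # suggested outliers removed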
Example #21
def method_isolation_forest():
    X, Y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)
    max_review_length = min(map(len, X))
    X_train = sequence.pad_sequences(X_train,
                                     maxlen=max_review_length,
                                     dtype=float)
    X_test = sequence.pad_sequences(X_test,
                                    maxlen=max_review_length,
                                    dtype=float)
    X_train = X_train[y_train == labels["Buy"]]
    model = IsolationForest(contamination=0.5)
    model.fit(X_train)
    yhat = model.predict(X_test)
    y_test[y_test == labels["NotBuy"]] = -1
    y_test[y_test == labels["Buy"]] = 1
    print("accuracy: ", metrics.accuracy_score(y_test, yhat))
    score = f1_score(y_test, yhat, pos_label=-1)
    print('F1 Score: %.3f' % score)
    def test_tvm_iforest_remainder_batch(self):
        warnings.filterwarnings("ignore")
        num_classes = 2
        model = IsolationForest(n_estimators=10, max_samples=2)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=105)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), rtol=1e-06, atol=1e-06)
Example #23
def datacleaning_if(input_file, output_file):

    with open(input_file, 'r') as f:
        # 1. create a reader object
        reader = csv.reader(f)
        # 2. read the first (header) row of the file
        head_row = next(reader)
    data_attribute = []
    for item in head_row:
        data_attribute.append(item)
    # read the data
    tn = pd.read_csv(input_file)
    tn.dropna(inplace=True)
    train = np.array(tn)
    train_x = train[:, :-1]  # used later when reporting evaluation results
    # run anomaly detection over every data row
    train_x = np.array(train_x)
    clf = IsolationForest(n_estimators=100,
                          max_samples='auto',
                          contamination=0.0001,
                          max_features=1.0,
                          bootstrap=False,
                          n_jobs=1,
                          random_state=None,
                          verbose=0).fit(train_x)
    # pred holds each row's prediction: 1 (normal) or -1 (anomaly)
    pred = clf.predict(train_x)
    normal = train_x[pred == 1]
    abnormal = train_x[pred == -1]
    # drop the rows whose pred is -1
    df = pd.read_csv(input_file)[0:pred.size]
    df['pred'] = pred
    # data2=data1[-data1.sorce.isin([61])]
    df2 = df[-df.pred.isin([-1])]
    df2 = df2.drop(['pred'], axis=1)
    # write the cleaned data to a csv file
    data_out = df2.iloc[:, :].values
    csvfile2 = open(output_file, 'w', newline='')
    writer = csv.writer(csvfile2)
    writer.writerow(data_attribute)  # write the header attributes
    m = len(data_out)
    for i in range(m):
        writer.writerow(data_out[i])
Example #24
def remove_outliers (df, field, contamination=0.01, verbose=False):
    '''Function will run an Isolation Forest to determine values that 
    are outliers in the given field and will remove those data points 
    before returning a new dataframe.'''
    # Use a deep copy of data to avoid making changes to original
    X = df[field].copy(deep=True)
    X = X.values.reshape(-1, 1)
    # Prepare and fit the Isolation Forest
    IsoFor = IsolationForest(bootstrap=True, n_jobs=-1, contamination=contamination)
    IsoFor.fit(X)
    # Make predictions
    y_pred = IsoFor.predict(X)
    if verbose:
        num_outliers = np.unique(y_pred, return_counts=True)[1][0]
        print('{} outliers detected and removed from dataframe.'.format(num_outliers))
    # Truth value of non_outliers (equal to 1)
    non_outliers = y_pred == 1
    # Return new df
    return df[non_outliers].copy(deep=True)
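For example (illustrative data; 'price' is an assumed column name):

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
df = pd.DataFrame({'price': np.append(rng.normal(300, 25, 198), [5, 2000])})
clean = remove_outliers(df, 'price', contamination=0.01, verbose=True)
print(len(df), '->', len(clean))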
def compute_scores(o, n_iterations, pdb_ids, ab_truth, ab_coord, ab_X, ab_X_weights, precision, recall):
    print(outlier_fractions[o])
    forest = IsolationForest(contamination=outlier_fractions[o], n_jobs=4)
    for i in range(len(pdb_ids)):
        print(pdb_ids[i])
        current_precision = 0
        current_recall = 0
        for _ in range(n_iterations):
            forest.fit(ab_X[i], sample_weight=ab_X_weights[i])
            patch_pred_no_outliers = forest.predict(ab_coord[i])
            p, r, _, _ = precision_recall_fscore_support(ab_truth[i], patch_pred_no_outliers, average='binary')
            current_precision += p
            current_recall += r
        current_precision /= n_iterations
        current_recall /= n_iterations
        precision[o] += current_precision
        recall[o] += current_recall
    precision[o] /= len(pdb_ids)
    recall[o] /= len(pdb_ids)
Example #26
def do_isoForest(df, kwargs=None):
    """ Runs an isolation forest looking for outliers.
    """

    if kwargs is None:
        kwargs = {
            # 'n_estimators': 1000,
            'behaviour': 'new',
            # 'max_samples': 1000,
            'random_state': 42,
            'contamination': 'auto',
            'max_features': 1
        }

    forest = IsolationForest(**kwargs).fit(df)

    predics = forest.predict(df)

    return forest, predics
Example #27
def outliers_isolation_forest(df, target_encoded, encoder=None, contamination=0.001):
    """Using 'Isolation Forest', filters the outliers data points

    Args:
        df (DataFrame): Source data
        target_encoded (str): Target column name
        encoder (obj, optional): Object of the type 'IsolationForest'. Defaults to None.
        contamination (float, optional): Threshold to remove the outliers. Defaults to 0.001.

    Returns:
        DataFrame: Same as source
    """
    if encoder is None:
        encoder = IsolationForest(contamination=contamination)
        encoder.fit(df.drop([target_encoded], axis=1))
    y_pred = encoder.predict(df.drop([target_encoded], axis=1))
    mask = y_pred != -1
    df = df.loc[list(mask), :]
    return df, encoder
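A sketch of a call, assuming a numeric frame with an already-encoded target column:

import numpy as np
import pandas as pd

rng = np.random.RandomState(3)
df = pd.DataFrame(rng.normal(size=(500, 4)), columns=['f1', 'f2', 'f3', 'target'])
df_clean, iso = outliers_isolation_forest(df, 'target', contamination=0.01)
print(len(df), '->', len(df_clean))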
Example #28
def isolutionforest(DO):

    rng = np.random.RandomState(42)
    clf = IsolationForest(random_state=rng, contamination=0.025)  # contamination: expected proportion of anomalies
    clf.fit(DO)

    DO_copy = DO
    m = 0

    pre = clf.predict(DO)
    for i in range(len(pre)):
        if pre[i] == -1:
            DO_copy = np.delete(DO_copy, i - m, 0)
            plt.scatter(i,DO[i],c='red')
            print(i)
            m += 1
    # plt.plot(DO)
    # plt.show()
    return DO_copy
Example #29
def anomalies():
    cluster_dataframe = pd.read_csv("data/cluster.csv")
    X = cluster_dataframe.drop(['connections'], axis=1)
    clusters = 4
    X = normalize(X)
    reduced_data = PCA(n_components=clusters).fit_transform(X)

    #Outlier Test
    model = IsolationForest(contamination=0.05)
    model.fit(reduced_data)
    outliers = model.predict(reduced_data)
    outlier_frame = pd.DataFrame()
    outlier_frame['connections'] = cluster_dataframe['connections']
    outlier_frame['X'] = reduced_data[:, 0]
    outlier_frame['Y'] = reduced_data[:, 1]
    outlier_frame['isOutlier'] = outliers
    normal_connection = outlier_frame.loc[outlier_frame.isOutlier == 1]
    anomalous_connection = outlier_frame.loc[outlier_frame.isOutlier == -1]
    return render_template('anomalies.html', items=list(anomalous_connection['connections']))
Example #30
def test_isolationforest():
    # Load data
    X, _ = make_blobs(n_samples=400,
                      centers=[[0, 0], [0, 0]],
                      cluster_std=0.5,
                      n_features=2,
                      random_state=42)
    X_outlier = np.random.RandomState(42).uniform(low=-6, high=6, size=(50, 2))

    # Create and fit model
    model = IsolationForest(random_state=42)
    model.fit(X)

    # Compute counterfactuals
    x = X[0, :]
    y_target = -1
    assert model.predict([x]) == 1

    x_cf, y_cf, _ = generate_counterfactual(model,
                                            x,
                                            y_target=y_target,
                                            return_as_dict=False)
    assert y_cf == y_target
    assert model.predict(np.array([x_cf])) == y_target

    x = X_outlier[1, :]
    y_target = 1
    assert model.predict([x]) == -1

    x_cf, y_cf, _ = generate_counterfactual(model,
                                            x,
                                            y_target=y_target,
                                            return_as_dict=False)
    assert y_cf == y_target
    assert model.predict(np.array([x_cf])) == y_target

    cf = generate_counterfactual(model,
                                 x,
                                 y_target=y_target,
                                 return_as_dict=True)
    assert cf["y_cf"] == y_target
    assert model.predict(np.array([cf["x_cf"]])) == y_target

    # Other stuff
    from ceml.sklearn import IsolationForest as IsolationForestCf
    model_cf = IsolationForestCf(model)
    assert model.predict([x]) == model_cf.predict(x)

    with pytest.raises(TypeError):
        IsolationForestCf(sklearn.linear_model.LogisticRegression())
Example #31
def runIF(X, Y, model):
    """
    Isolation Forest model ...

    Inputs
    ------
    X: {array-like, matrix}, shape = [n_samples, n_features]
    Y: {1-D array-like} shape must equal number of rows in X
    model: output of filterK.kMeansModel function including: k-means model and 
        normalization and contains list with following elements:
            out[0] = object of class 'sklearn.cluster.k_means_.KMeans'
            out[1] = shift for data normalisation
            out[2] = scale for data normalisation

    Outputs
    -------
    return following
        mask_outliers = all data labelled as outlier (-1) by IF
        y_pred is cluster label for each point
    """

    X = np.array(X)
    Y = np.array(Y)

    # Check parameters
    if Y.shape[0] != X.shape[0]:
        raise Exception('X dataset shape does not match Y')

    # Normalise data
    X = X * model[2] + model[1]

    clf = model[0]
    y_pred = clf.predict(X)

    IF = IsolationForest(n_estimators=500, max_samples='auto', random_state=0)
    IF.fit(X)
    if_apply = IF.predict(X)
    mask_outliers = (if_apply == -1)

    return [mask_outliers, y_pred]
class iForest():
    MAX_N_SAMPLES = 32000

    def __init__(self, max_number_of_samples=None, outliers_fraction=0.1, n_estimators=100):
        self.max_number_of_samples = max_number_of_samples if max_number_of_samples else self.MAX_N_SAMPLES
        self.outliers_fraction = outliers_fraction
        self.n_estimators = n_estimators
        self.classifier = IsolationForest(n_estimators=self.n_estimators,max_samples=self.max_number_of_samples,
                                          contamination=self.outliers_fraction, random_state=None)

    def train(self, train_data):

        n_train_samples = train_data.shape[0]
        train_data = train_data.reshape(n_train_samples, -1)

        if n_train_samples > self.max_number_of_samples:
            logging.warning(
                'Discarding training data: using {} of {} chunks.'.format(self.max_number_of_samples, n_train_samples))
            train_data = self._subsample_data(train_data)

        self.classifier.fit(train_data)

    def predict(self, test_sample):
        data = test_sample.reshape(test_sample.shape[0], -1)
        prediction = self.classifier.predict(data)
        return prediction

    def decision_function(self, test_sample):
        data = test_sample.reshape(test_sample.shape[0], -1)
        anomaly_score = self.classifier.decision_function(data)
        return np.squeeze(anomaly_score)

    def _subsample_data(self, data):
        return data[np.random.choice(data.shape[0], self.max_number_of_samples, replace=False)]

    @property
    def configuration(self):
        return {
            'max_number_of_samples': self.max_number_of_samples,
            'outliers_fraction': self.outliers_fraction,
            'n_estimators': self.n_estimators
        }
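A usage sketch for the wrapper above; the chunked arrays and parameters are assumptions:

import numpy as np

train_chunks = np.random.randn(500, 16, 16)   # e.g. 500 fixed-size data chunks
test_chunks = np.random.randn(20, 16, 16)

detector = iForest(outliers_fraction=0.05, n_estimators=50)
detector.train(train_chunks)
# note: max_samples defaults to 32000 here; scikit-learn caps it at n_samples with a warning
print(detector.predict(test_chunks))          # +1 normal, -1 anomalous
print(detector.configuration)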
Example #33
def check_relative_anomaly_score(real_data=None,
                                 test_data=None,
                                 DIR=None,
                                 domain_dims=None):

    _cur_path_ = os.path.abspath(__file__).replace('.py', '')
    _cur_path_ = '/'.join(_cur_path_.split('/')[:-1])
    save_dir = os.path.join(_cur_path_, 'saved_model/{}'.format(DIR))

    path = Path(save_dir)
    path.mkdir(parents=True, exist_ok=True)
    f_path = os.path.join(save_dir, 'ad_if.pkl')
    print(f_path)
    AD_obj = None
    if not os.path.exists(f_path) and real_data is not None:
        AD_obj = IsolationForest(n_estimators=100,
                                 contamination=0.01,
                                 n_jobs=mp.cpu_count(),
                                 verbose=True)
        # Convert real data to one-hot encoded
        oh_data = convert_to_01(real_data, domain_dims)
        AD_obj.fit(oh_data)
        print("Model fitting done.")
        pickle.dump(AD_obj, open(f_path, 'wb'), pickle.HIGHEST_PROTOCOL)

    elif os.path.exists(f_path):

        AD_obj = pickle.load(open(f_path, 'rb'))
    print(AD_obj)

    if test_data is not None and AD_obj is not None:
        print(test_data.shape)

        oh_data = convert_to_01(test_data, domain_dims)
        print(oh_data.shape)

        # -1 for outliers
        y = AD_obj.predict(oh_data)
        # percentage of data points predicted as anomalies
        count_outliers = np.where(y == -1.0)[0].shape[0]
        data_len = test_data.shape[0]
        return (count_outliers / data_len)
def findBestModel(X_train, X_test, Y_test, model):
    """ Function to find the best parameters to use for a given model 
    components: 
    X_train: numpy array of the input data
    X_test: list containing numpy arrays of different test data
    Y_test: list containing numpy array of different test outcomes (note that this is configured differently 
    for different algorithms,for iForest, each column must have -1 or 1. -1 --> the anomaly, if 1 --> not an anomaly)
    model: string to determine model type
    """
    if model == 'iForest':
        for max_features in range(1, X_train.shape[1] + 1):
            # scikit-learn requires contamination in (0, 0.5], so sweep it as a percentage
            for contamination in range(1, 51):
                iForest = IsolationForest(n_estimators=100,
                                          max_features=max_features,
                                          contamination=contamination / 100,
                                          random_state=0).fit(X_train)
                for x_test, y_test in zip(X_test, Y_test):
                    y_hat = iForest.predict(x_test)
                    score = evaluate(y_test, y_hat)  # returns accuracy score
                    print(score)
Example #35
def compute_outliers(metrics, test_metrics, metric_key, sort_key_list, es):
    train = metrics[metric_key]  # [:-test_samples]
    test = test_metrics[metric_key]  #[-test_samples:]
    test_time_stamps = sort_key_list

    X_train = pd.DataFrame(train, columns=['sample'])
    X_test = pd.DataFrame(test, columns=['sample'])

    clf = IsolationForest(max_samples=1000)
    clf.fit(X_train)
    outliers = clf.predict(X_test)

    for i in range(0, len(test_time_stamps)):
        # Store whether the metric sampled at this time stamp was an anomaly or not
        body = {sort_key: test_time_stamps[i],
                "metrics": metric_key,
                "value": float(test[i]),
                "outlier": int(outliers[i])}
        es.index(index=anomalies_index,
                 doc_type='predicted',
                 body=body)
Example #36
 def test_isolation_forest_score_samples(self):
     isol = IsolationForest(n_estimators=3, random_state=0)
     data = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                     dtype=np.float32)
     model = isol.fit(data)
     model_onnx = to_onnx(model,
                          data,
                          target_opset=TARGET_OPSET,
                          options={'score_samples': True})
     sess = InferenceSession(model_onnx.SerializeToString())
     names = [o.name for o in sess.get_outputs()]
     self.assertEqual(names, ['label', 'scores', 'score_samples'])
     got = sess.run(None, {'X': data})
     self.assertEqual(len(got), 3)
     expected_label = isol.predict(data)
     expected_decif = isol.decision_function(data)
     expected_score = isol.score_samples(data)
     assert_almost_equal(expected_label, got[0].ravel())
     assert_almost_equal(expected_decif, got[1].ravel())
     assert_almost_equal(expected_score, got[2].ravel())
Example #37
class IF_real_bogus:
    def __init__(self, feature):
        self.feature = feature
        #self.Class = []
        #[self.Class.append('real') for i in range(len(self.feature))]
        #self.trainning, self.test, self.Class_train, self.Class_test = train_test_split(self.real_feature,\
        #                        self.real_Class, test_size=0.4, random_state=0)
        #print self.trainning
    def train(self):
        self.IFmod = IF(n_estimators=160)
        self.IFmod.fit(self.feature)

    def validation(self):
        #score = self.IFmod.decision_function(self.feature)
        result = self.IFmod.predict(self.feature)
        self.normal = (result == 1)
        self.abnormal = (result == -1)
        print "total:{0}, normal:{1}, abnormal:{2}, normal/total:{3}".format(
            len(self.feature), self.normal.sum(), self.abnormal.sum(),
            self.normal.sum() / float(len(self.feature)))
Example #38
def feature_engineering(datasets):
    #generate some features here, need discuss

    #read the file
    train = pd.read_csv(datasets["train"])
    train_X = train.iloc[:, :-1]
    train_y = train.TARGET

    # add outlier feature
    test = pd.read_csv(datasets["test"])
    n = train_X.shape[0]
    clf = IsolationForest(random_state=42)
    clf.fit(train_X)
    outlier = clf.predict(train_X)
    train_X = pd.DataFrame(train_X)
    train_X["outlier"] = outlier

    #add xxx

    return train_X, train_y, test
Example #39
def outlier_detection(df):
    X = df.drop('AdoptionSpeed', axis=1)
    Y = df['AdoptionSpeed']

    # Isolation Forest
    clf = IsolationForest(contamination=0.08, max_samples=256,
                          behaviour="new")  # 8% outliers
    clf.fit(X)
    processedData1 = clf.predict(X) == -1

    print("Isolation Forest: ", sum(processedData1))

    clf = LocalOutlierFactor(contamination='auto')
    processedData2 = clf.fit_predict(X) == -1
    print("LOF:", sum(processedData2))

    newData = df.loc[processedData1 == 0]
    newData = newData.reset_index(drop=True)

    return newData
Example #40
def remove_outliers(df_train):
    """
    :objective: Remove outliers. Before dividing into X/y
    :return: pandas dataframe
    """
    numeric_colnum = df_train.columns.get_indexer(['판매단가', '취급액']).tolist()
    feature_set = df_train.iloc[:, numeric_colnum]
    # identify outliers in the training dataset
    iso = IsolationForest(n_estimators=50,
                          max_samples=50,
                          contamination=float(0.05),
                          max_features=1.0)
    iso.fit(feature_set)
    pred = iso.predict(feature_set)
    feature_set['anomaly'] = pred
    outliers = feature_set.loc[feature_set['anomaly'] == -1]
    outlier_index = list(outliers.index)
    df_train = df_train.loc[~df_train.index.isin(outlier_index)].reset_index()

    return df_train
Example #41
 def training_oulier_alldata(self, data, t_data, outlier_features,
                             complete_row_idx, missing_lines):
     complete_data = data[outlier_features].iloc[complete_row_idx, :]
     test_data = t_data[outlier_features]
     all_data = pd.concat([complete_data, test_data],
                          axis=0,
                          ignore_index=True)
     ilf = IsolationForest(n_estimators=min(100, len(all_data)),
                           n_jobs=-1,
                           verbose=2)
     ilf.fit(all_data)
     if len(complete_data) > 0:
         pred = ilf.predict(complete_data)
         outlier_idx = [
             complete_row_idx[i] for i in np.where(pred == -1)[0]
         ]
         # put these outlier data into the missing list
         for k, v in missing_lines.items():
             missing_lines[k] = np.append(v, outlier_idx)
     return ilf, missing_lines
Example #42
def find_circle_R(kmeans):
    R = []
    centers = []
    for i in range(EMOTION_NUM):
        clf = IsolationForest(max_samples=60)
        random_idx = np.random.permutation(range(len(kmeans[i])))
        X_train = kmeans[i][random_idx]
        clf.fit(X_train)
        pred_outliers = clf.predict(np.array(kmeans[i]))
        kmeans_in_circle = kmeans[i][pred_outliers == 1]
        center = np.mean(kmeans_in_circle, 0)
        centers.append(center)
        d_point_center = []
        for j in range(len(kmeans_in_circle)):
            d_point_center.append(
                nltk.cluster.util.cosine_distance(
                    center, kmeans_in_circle[j].tolist()))
        R.append(np.sort(d_point_center)[len(d_point_center) - 1])

    return R, centers
Example #43
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that there is at most 6 errors (false positive or false negative)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
Example #44
        data = data.join(vecData)
    return data, vecData, vec

df, t, v = ohEncoding(df, col, replace=True)

print "Shape after encoding"
print type(df.shape)

df_unlabeled = df.drop("Anomaly", axis=1)
print "Shape of the dataframe without anomaly column: "
print df_unlabeled.shape

clf = IsolationForest(max_samples=6444, verbose=1, n_jobs=-1,
                      contamination=0.255555, bootstrap=True, max_features=9)
clf.fit(df_unlabeled)
pred = clf.predict(df_unlabeled)
# print type(pred)
# print data.shape
# print len(pred)
# print pred
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
# print anomalies
# print type(anomalies)

df['ISO1'] = pred

# iterate over rows
nLabAno = 0
nDetAno = 0
nFalsePositives = 0
Example #45
# ## Improving the Predicition model ##
# This part is about finding a better metric for predicting future house sales regarding their price.
# 
# First, I will detect outliers and delete them from the dataset if needed.

# ### Detecting Outliers ###
# The first step to improve our learning behaviour is to find outliers and then remove them from the data set if needed.
# To detect outliers I will use the Isolation Forest algorithm, which works well for high-dimensional data sets like the one we have here. 

# In[ ]:

from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(df)
y = clf.predict(df)
print(y)


# ### Location based prices ###
# House prices don't only depend on the size of the house or the number of rooms, but also strongly on the location of said house. To get an idea of how the position might impact my data, I analyse the relationship between location and price in my dataset.

# In[ ]:

import gmaps
gmaps.configure(api_key="AIzaSyDPWAl8lcrK9q-tOkrl64sGkxDnbWz47Ko")

locations = df[["lat", "long"]]
prices = df["price"]

heatmap_layer = gmaps.heatmap_layer(locations, weights=prices)
rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
    Xtrain.append(column_train)
    Xtest.append(column_test)

Xtrain = np.transpose(np.array(Xtrain))
Xtest = np.transpose(np.array(Xtest))
idx_train = idx_train[:Xtrain.shape[0]]
idx_test = idx_test[:Xtest.shape[0]]

# fit an iforest
iforest = IsolationForest(n_estimators=ntrees,
                          max_samples=sample_frac, max_features=feat_frac,
                          n_jobs=-1, random_state=rng, verbose=1)
iforest.fit(Xtrain)

# anomaly predictions (+1 inlier, -1 outlier)
y_pred_train = iforest.predict(Xtrain)
y_pred_test = iforest.predict(Xtest)
train_feature_values = [(gid, val)
                        for gid, val in zip(idx_train, list(y_pred_train))]
test_feature_values = [(gid, val)
                        for gid, val in zip(idx_test, list(y_pred_test))]
for i, scenario in enumerate(MALICIOUS_SCENARIOS):
    all_feature_values = train_feature_values + \
                         [(gid, feat_value)
                          for gid, feat_value in test_feature_values
                          if gid // 100 in BENIGN_SCENARIOS or
                             gid // 100 == scenario]
    all_values = np.array([feat_value
                           for gid, feat_value in all_feature_values]).reshape(-1, 1)
    y_true = [1 if gid // 100 in MALICIOUS_SCENARIOS else 0
              for gid, feat_value in all_feature_values]
        # merge
        vehicle = pd.merge(rpm, speed, how = 'outer', on = 'timestamp')

        # drop null values and zero speeds --> neutral gear
        # speed < 200 to remove outliers

        vh = vehicle.dropna(axis = 0)
        vh = vh[(vh['rpm'] > 0) & ((vh['speed'] > 0) & (vh['speed'] < 200))]

        # detect outliers using IsolationForest
        # assume contamination at 0.01 level

        distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine')
        clf = IsolationForest(max_samples = 100, contamination = 0.01, verbose = 1)
        clf.fit(distances)
        labels = clf.predict(distances)
        vh['outlier'] = labels

        # remove outliers found by IsolationForest
        vh = vh[['rpm','speed']][vh['outlier'] == 1]

        #recompute distances after outlier removal
        distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine')

        # initialize variable to keep best model, its silhouette score and predicted labels
        best_model = (None, -1, None)

        # iterate over possible number of gears
        # since we want to pick model with best silhouette score, can't start with single cluster (k=1)

        for k in range(2,7):
    n_samples, n_features = np.shape(X)
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('IsolationForest processing...')
    model = IsolationForest(bootstrap=True, n_jobs=-1)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = -model.decision_function(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC for %s (area = %0.3f, train-time: %0.2fs, test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
    #create the isolation forest class and factorize the class column
    clf = IsolationForest(n_estimators=opts.numtrees)


    #train the isolation forest on the training set, dropping the class column (since the trainer takes that as a separate argument)
    print('\nTraining')
    clf.fit(train.drop('class', axis=1))

    #remove the 'answers' from the test set
    testnoclass = test.drop('class', axis=1)

    print('\nPredicting (class 1 is normal, class -1 is malicious)')

    #evaluate our results on the test set.
    test.is_copy = False
    test['prediction'] = clf.predict(testnoclass)
    print()

    #group by class (the real answers) and prediction (what the forest said). we want these values to match for 'good' answers
    results=test.groupby(['class', 'prediction'])
    resultsagg = results.size()
    print(resultsagg)

    tp = float(resultsagg[-1,-1]) if (-1,-1) in resultsagg.index else 0
    fp = float(resultsagg[1,-1]) if (1,-1) in resultsagg.index else 0
    fn = float(resultsagg[-1,1]) if (-1,1) in resultsagg.index else 0
    f1 = 2*tp/(2*tp + fp + fn)
    print("F1 = %s" % f1)

    #save the vectorizers and trained RF file
    joblib.dump(vectorizers, opts.vectorizerfile)