Example #1
def outliers(X_train, X_test=None, features=None, name=''):

    for i, model_cls in enumerate(
        [IsolationForest, EllipticEnvelope, OneClassSVM, LocalOutlierFactor]):

        # instantiate the i-th detector; the original re-created
        # IsolationForest(contamination=0.1) here, so the loop never
        # actually used the other three model classes
        model = model_cls()
        if features:
            pred = model.fit_predict(X_train[features])
        else:
            pred = model.fit_predict(X_train)

        X_train[f"{name}_outlier_detection_{i}"] = pred

        if X_test is not None:
            # note: LocalOutlierFactor supports predict() on unseen data
            # only when constructed with novelty=True
            if features:
                pred = model.predict(X_test[features])
            else:
                pred = model.predict(X_test)

            X_test[f"{name}_outlier_detection_{i}"] = pred

    if X_test is not None:
        return X_train, X_test
    else:
        return X_train
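A minimal usage sketch for the helper above, assuming the four scikit-learn imports it relies on and a purely numeric DataFrame (`df_train` is illustrative, not from the original):

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(0)
df_train = pd.DataFrame(rng.normal(size=(100, 2)), columns=['a', 'b'])
df_train = outliers(df_train, name='demo')
# one -1/1 flag column per detector
print(df_train.filter(like='outlier_detection').head())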
Example #2
def detect_outliers_IF(df, n_estimators=100):
    '''Returns the outlier scores using IsolationForest.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric data to score; the lower the score, the more anomalous the row.
    '''
    clf = IsolationForest(n_estimators=n_estimators, contamination=0.1, random_state=123)
    # fit() is enough here; the original called fit_predict() and discarded the labels
    clf.fit(df)
    scores = clf.score_samples(df)
    return scores
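A hedged usage sketch for detect_outliers_IF (the data here is synthetic):

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

df_demo = pd.DataFrame(np.random.RandomState(0).normal(size=(100, 3)))
scores = detect_outliers_IF(df_demo)
# score_samples() returns the negated anomaly score: the lower, the more abnormal
print(df_demo[scores < np.quantile(scores, 0.05)])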
Example #3
def generateData(filename_train, filename_test):
    data = pd.read_csv(filename_train, header=None)
    test_data = pd.read_csv(filename_test, header=None)

    train_data = data.iloc[:, :-2]
    test_data = test_data.iloc[:, :-1]

    # normalize both sets with the same scaler fitted on the training data
    # (the original fitted MinMaxScaler on train but a separate
    # StandardScaler on test, so the two sets were scaled inconsistently)
    scaler = MinMaxScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    rng = np.random.RandomState(10)
    # behaviour='new' was dropped: the parameter was deprecated and then
    # removed in scikit-learn 0.24
    clf = IsolationForest(n_estimators=200,
                          max_samples=200,
                          max_features=5,
                          random_state=rng,
                          contamination='auto')
    clf.fit(train_data)  # train data
    # predict() scores the test set with the forest fitted on the training
    # data; the original called fit_predict(), which silently refit on test
    pre_label = clf.predict(test_data)
    print(pre_label)
    count = 0
    index_numbers = []
    for index, i in enumerate(pre_label):
        if i == -1:
            count += 1
            index_numbers.append(index)
    print(index_numbers)
    print('Number of outlier samples:', count)
    return index_numbers
Example #4
def calculateKNNgraphDistanceMatrixML(featureMatrix,
                                      distanceType='euclidean',
                                      k=10,
                                      param=None):
    r"""
    Thresholdgraph: KNN Graph with Machine Learning based methods

    IsolationForest
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html#sklearn.ensemble.IsolationForest 
    """

    distMat = distance.cdist(featureMatrix, featureMatrix, distanceType)
    edgeList = []

    # parallel: n_jobs=-1 for using all processors
    clf = IsolationForest(contamination='auto', n_jobs=-1)

    for i in np.arange(distMat.shape[0]):
        res = distMat[i, :].argsort()[:k + 1]
        preds = clf.fit_predict(featureMatrix[res, :])
        for j in np.arange(1, k + 1):
            # preds[j] == -1 marks an outlier; inliers (label 1) keep weight 1.0
            weight = 0.0 if preds[j] == -1 else 1.0
            edgeList.append((i, res[j], weight))

    return edgeList
Example #5
def variance_contrast(X, k=3, contamination=0.01):
    X = StandardScaler().fit_transform(X)
    pca = PCA(n_components=None, random_state=2018)
    pca.fit(X)
    # variance_original holds the eigenvalue of each principal component,
    # i.e. the variance of the samples projected onto that component
    variance_original = pca.explained_variance_

    # run IsolationForest to obtain the indices of anomalous samples
    iforest = IsolationForest(contamination=contamination, random_state=2018, n_jobs=-1)
    anomaly_pred = iforest.fit_predict(X)
    anomaly_indices = np.argwhere(anomaly_pred == -1).ravel()

    # drop the anomalous samples to obtain X_trimmed
    X_trimmed = X[np.isin(range(len(X)), anomaly_indices, invert=True)]
    # rerun PCA on X_trimmed to get the revised eigenvalues
    pca.fit(X_trimmed)
    variance_revised = pca.explained_variance_

    # compare the eigenvalues before and after removing the anomalies
    delta_ratio = (variance_revised - variance_original) / variance_original
    # keep only components whose eigenvalue decreased (negative ratio);
    # np.where preserves the original component indices (the original code
    # sorted a filtered copy, so its indices no longer referred to components)
    candidate_idx = np.where(delta_ratio < 0)[0]
    order = np.argsort(delta_ratio[candidate_idx])
    # k is a preset parameter: take the k components whose eigenvalues
    # decreased the most (fewer if fewer decreased)
    indices_desc_topk = candidate_idx[order[:k]]

    # min_max_idx are the indices of the smallest and largest eigenvalues
    min_max_idx = [0, X.shape[1] - 1]
    # check whether either of these indices appears in indices_desc_topk
    bool_result = any(np.isin(min_max_idx, indices_desc_topk))
    return indices_desc_topk, bool_result
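A short, hedged demo of variance_contrast on synthetic data (all names below are illustrative):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(2018)
X_demo = rng.normal(size=(500, 6))
X_demo[:5] += 10  # inject a few gross outliers
topk, hits_extremes = variance_contrast(X_demo, k=2)
print(topk, hits_extremes)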
Example #6
def model_iF(data):
    from sklearn.ensemble import IsolationForest
    
    iF = IsolationForest(random_state=0)
    y_pred = iF.fit_predict(data["X_test"])
    
    return y_pred
Example #7
class ISF(object):
    def __init__(self, file_name, config):
        self.dataset = config.dataset
        self.file_name = file_name

        self.x_dim = config.x_dim

        self.n_estimators = config.n_estimators
        self.max_samples = config.max_samples
        self.bootstrap = config.bootstrap
        self.max_features = config.max_features
        self.contamination = config.contamination

        self.pid = config.pid

        self.model = IsolationForest(n_estimators=self.n_estimators,
                                     max_samples=self.max_samples,
                                     bootstrap=self.bootstrap,
                                     max_features=self.max_features,
                                     contamination=self.contamination)

    def fit(self, train_input, train_label, test_input, test_label):
        y_pred = self.model.fit_predict(train_input)
        decision_function = self.model.decision_function(train_input)

        isf_output = ISFOutput(y_hat=y_pred,
                               decision_function=decision_function)

        return isf_output
Example #8
    def isolation_criterion(self):
        clf = IsolationForest(max_samples=100,
                              random_state=1,
                              contamination="auto")
        preds = clf.fit_predict(self.p_table)
        return preds
Example #9
def scan(leagues, positions, transfer_fee, wage, age):
    df = load_data()

    df = df[df["age"] <= age]
    if all_name not in leagues:
        df = df[df["league"].isin(leagues)]
    df = df[(df["Value"] <= transfer_fee) & (df["Wage"] <= wage)]

    df["filter_positions"] = df.apply(
        lambda row: filter_positions(row, positions), axis=1)
    search_space = df.loc[df["filter_positions"] == True]
    search_space.reset_index(drop=True, inplace=True)

    # find outliers here
    # Returns -1 for outliers and 1 for inliers.

    X = search_space[possible_columns_to_compare].to_numpy()
    clf = IsolationForest(random_state=42, n_jobs=-1)
    search_space["label"] = pd.Series(list(clf.fit_predict(X)))
    search_space["score"] = pd.Series(list(clf.score_samples(X)))

    # The anomaly score of the input samples. The lower, the more outlier.
    search_space.sort_values(by=["score"], inplace=True)

    return search_space
Example #10
    def addSeg(self, seg1, seg2, N):
        self.overSeg[np.logical_or(self.overSeg == seg1,
                                   self.overSeg == seg2)] = N
        temp = self.overSeg == N
        self.segL[N] = np.mean(self.lab[:, :, 0][temp])
        self.segA[N] = np.mean(self.lab[:, :, 1][temp])
        self.segB[N] = np.mean(self.lab[:, :, 2][temp])

        for cnnRatioIdx, cnnRes in enumerate(self.allRatioCnnRes):
            self.clfList[N][cnnRatioIdx] = []
            self.segVectors[N][cnnRatioIdx] = []
            self.segClustersL2[N][cnnRatioIdx] = []
            for layerNum, outLayer in enumerate(cnnRes):

                v = outLayer[temp]
                if self.ifFilter:

                    clf = IsolationForest(max_samples=max(
                                              4, v.shape[0] // self.clfSplit),
                                          n_estimators=self.n_trees,
                                          random_state=0,
                                          contamination='auto',
                                          n_jobs=1)
                    v_new = v[clf.fit_predict(v) == 1, :]
                    if v_new.shape[0] != 0:
                        v = v_new
                if v.shape[0] > self.maxSize:
                    np.random.seed(0)
                    v = v[np.random.randint(v.shape[0], size=self.maxSize)]
                self.segVectors[N][cnnRatioIdx].append(v)
                cl = np.mean(v, axis=0)
                self.segClustersL2[N][cnnRatioIdx].append(cl)
Example #11
def main():
    data = pd.read_csv("data.csv")

    for idx, row in data.iterrows():
        data.at[idx, 'ML'] = wordToNumber(row['ML'])
        data.at[idx, 'DW'] = wordToNumber(row['DW'])

    mldw_data = data.loc[:, 'ML':].values

    clf = IsolationForest(max_samples=28,
                          random_state=1,
                          contamination='auto')
    preds = clf.fit_predict(mldw_data)

    # print(data['NAMA'][1])

    outliers_idx = [i for i, pred in enumerate(preds) if pred == -1]

    for i in range(len(preds)):
        if i in outliers_idx:
            plt.scatter(mldw_data[i][0], mldw_data[i][1], c="red")
            plt.annotate(data['NAMA'][i], (mldw_data[i][0], mldw_data[i][1]))
        else:
            plt.scatter(mldw_data[i][0], mldw_data[i][1], c="black")

    plt.show()
Example #12
def remove_outliers(train_x, train_y):
    iso = IsolationForest(contamination='auto')
    yhat = iso.fit_predict(train_x)
    # yhat is -1 for outliers and 1 for inliers, so count the -1 entries
    # (the original summed yhat, which gives inliers minus outliers)
    print("Removed " + str((yhat == -1).sum()) + " outliers")
    mask = yhat != -1
    train_x, train_y = train_x[mask, :], train_y[mask]
    return train_x, train_y
Example #13
def isof(x, y):
    """This will be our function used to resample our dataset."""
    print('Initiating Outlier detection')
    model = IsolationForest()
    y_pred = model.fit_predict(x)
    print('isof: Outliers removed', x[y_pred == -1].shape[0])
    return x[y_pred == 1], y[y_pred == 1]
Example #14
def isolation_forest_outlier_removal(X,
                                     y,
                                     seed,
                                     n_estimators=150,
                                     max_samples=0.8,
                                     max_features=0.8,
                                     contamination="auto"):

    clf = IsolationForest(n_estimators=n_estimators,
                          max_samples=max_samples,
                          contamination=contamination,
                          max_features=max_features,
                          random_state=seed,
                          n_jobs=-1)

    results = clf.fit_predict(X.values)

    # count the -1 labels (the original tallied this but never used it)
    outliers = int((results == -1).sum())
    print("Isolation forest found {} outliers".format(outliers))

    removing_indices = [i for i in range(0, len(results)) if results[i] == -1]
    X_train_new = X.drop(X.index[removing_indices])
    y_train_new = [y[yi] for yi in range(0, len(y)) if results[yi] == 1]

    return X_train_new, y_train_new
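A minimal invocation sketch for the function above (synthetic inputs; assumes IsolationForest is already imported as in the examples on this page):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.normal(size=(200, 4)))
y_demo = list(rng.randint(0, 2, size=200))
X_clean, y_clean = isolation_forest_outlier_removal(X_demo, y_demo, seed=0)
print(len(X_demo) - len(X_clean), "rows removed")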
Example #15
def dixon():
    try:
        data = np.array(request.json["Data"])
        params = request.json['Params']

        n_estimators = 100
        max_samples = "auto"
        contamination = "auto"

        if "n_estimators" in params:
            n_estimators = params["n_estimators"]

        if "max_samples" in params:
            max_samples = params["max_samples"]

        if "contamination" in params:
            contamination = params["contamination"]

        clf = IsolationForest(n_estimators=n_estimators,
                              max_samples=max_samples,
                              contamination=contamination)

        indices = clf.fit_predict(data)
        indices = [0 if x == 1 else 1 for x in indices.tolist()]
        if params.get('ReturnDoubles'):
            indices = -clf.score_samples(data)
            indices = indices.tolist()
        return jsonify({"message": "OK", "data": indices})
    except Exception as e:
        return jsonify({"message": str(e)}), 400
Example #16
    def grid_search(self):
        print("============ Starting Gaussian NB grid search ============")
        best_accuracy = 0
        best_var_smoothing = None
        best_contamination = None

        for var_smoothing_i in np.linspace(0.001, 1, 5):
            self.var_smoothing_i = var_smoothing_i

            # contamination must lie in (0, 0.5], so the grid starts above 0
            for contamination_i in np.linspace(0.01, 0.5, 20):
                iso = IsolationForest(contamination=contamination_i)
                yhat = iso.fit_predict(self.original_x_train)
                mask = yhat != -1
                self.x_train, self.t_train = self.original_x_train[
                    mask, :], self.original_t_train[mask]

                self.classifier = GaussianNB(var_smoothing=var_smoothing_i)
                mean_cross_validation_accuracy = self.cross_validation()

                if mean_cross_validation_accuracy == 100:
                    print(
                        "All train data was correctly classified during cross-validation !"
                    )

                if mean_cross_validation_accuracy > best_accuracy:
                    best_accuracy = mean_cross_validation_accuracy
                    best_var_smoothing = var_smoothing_i
                    best_contamination = contamination_i

        print(
            "Grid Search final hyper-parameters :\n"
            "     best_var_smoothing=", best_var_smoothing,
            "\n" + "     best_contamination=", best_contamination)

        return best_var_smoothing, best_contamination
Example #17
def outlier_smoothing(X,
                      contamination=0.15,
                      smoothing_window=4,
                      plot=True,
                      random_state=22,
                      verbose=True):
    """
    Outlier identification by IForest and 
    smoothing by rolling window median value
    """
    X_rolling_median = X.rolling(smoothing_window).median()
    X_rolling_mean = X.rolling(smoothing_window).mean()
    X_smoothing_ratio = X / X_rolling_median

    if plot:
        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X, label='original')
        plt.plot(X.index, X_rolling_median, label='rolling median')
        plt.title("Original vs. Rolling Median")
        plt.legend()
        plt.show()

        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X_smoothing_ratio, label="original:smoothing ratio")
        plt.title("Smoothing Ratio")
        plt.legend()
        plt.show()

    ## Find the outliers
    iso_forest = IsolationForest(contamination=contamination,
                                 random_state=random_state)
    peaks = np.where(
        iso_forest.fit_predict(
            X_smoothing_ratio[smoothing_window - 1:].values.reshape(-1, 1)) < 1)
    if verbose:
        print("Outliers found at ", X.index[peaks[0] + smoothing_window - 1])
    if plot:
        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X, label='original')
        plt.plot(X.index.values[peaks[0] + smoothing_window - 1],
                 X.values[peaks[0] + smoothing_window - 1], 'x')
        plt.title("Outlier Finders")
        plt.legend()
        plt.show()
    ## Change the outliers with corresponding smoothed values
    X_smoothed = X.copy()

    for i in range(len(X)):
        if np.any(peaks[0] + smoothing_window - 1 == i):
            # positional assignment must go through .iloc; plain X_smoothed[i]
            # is label-based and breaks on non-integer indices
            X_smoothed.iloc[i] = X_rolling_mean.iloc[i]

    if plot:
        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X, label='original')
        plt.plot(X.index, X_smoothed, label='smoothed')
        plt.title("Original vs. smoothed")
        plt.legend()
        plt.show()

    return X_smoothed
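A quick, hedged demo of outlier_smoothing on a synthetic series (assumes the pandas/numpy/matplotlib/IsolationForest imports the function relies on):

import numpy as np
import pandas as pd

rng = np.random.RandomState(22)
series = pd.Series(np.sin(np.linspace(0, 8, 200)) + rng.normal(0, 0.05, 200))
series.iloc[[50, 120]] += 3  # inject two spikes
smoothed = outlier_smoothing(series, plot=False)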
Example #18
    def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
        # identify and remove outliers from dataframe
        iso = IsolationForest(contamination=0.05, random_state=RANDOM_STATE)
        predict = iso.fit_predict(df.iloc[:, 0:-1])

        mask = predict != -1
        return df.iloc[mask]
Example #19
    def if_model_v2(self, col, n_estimators):
        x = self.prepare_date(col)
        shape = self.data.shape[0]
        target_mapper = x.shape[0] / shape
        cont = np.where(
            target_mapper < 0.1, 0.1,
            np.where(target_mapper < 0.5, 0.07,
                     np.where(target_mapper < 0.7, 0.06, 0.05)))

        # cont from np.where is a 0-d array; cast to float for
        # scikit-learn's parameter validation
        clf = IsolationForest(n_estimators=n_estimators,
                              n_jobs=-1,
                              contamination=float(cont),
                              max_samples=0.8)
        x = x.iloc[:, :1]
        x['t'] = np.array(range(x.shape[0])) + 1
        print(x.shape)

        # fit_predict() both fits and labels; the preceding clf.fit(x) in
        # the original was redundant
        predicted = clf.fit_predict(x)

        x['outlier'] = np.where(predicted == -1, True, False)
        x.reset_index(inplace=True)
        x['t'] = x.reset_index().iloc[:, 0]
        x.rename(columns={col: 'value'}, inplace=True)
        x['Series'] = col
        print('if', cont)

        return x
Example #20
    def detect_outliers(self, features: pd.DataFrame) -> pd.DataFrame:
        """
        Find outliers in `features` for bt_scooter.
        :param features: features before outlier removal
        :return: features after removing outliers
        """
        # identify outliers in the training dataset of bt scooter
        print('   Find outliers in features (based on features_columns) '
              'for bt_scooter')
        stat_list = []
        # Iteratively identify outliers.
        # Each pass removes 50% of the remaining records (contamination=0.5).
        for i in [1, 2, 3]:
            print(i)
            X_train = features[self.features_columns]
            stat_list.append(list(X_train.mean()))
            stat_list.append(list(X_train.std()))
            iso = IsolationForest(contamination=0.5)
            features['isoutlayer'] = iso.fit_predict(X_train)
            features = features[features['isoutlayer'] == 1]
            features = features.drop(columns='isoutlayer', axis=1)
        # forward slash avoids the invalid '\s' escape in the original path
        pd.DataFrame(
            stat_list,
            columns=self.features_columns).to_csv(self.output_folder +
                                                  '/stat_outliers.csv')
        return features
Example #21
    def detectWithIsolationForest(self):
        '''
        Apply the Isolation Forest.
        '''
        # Find Model Hyperparameters
        hpMap = self.config['AnomalyDetector'][
            'IsolationForestHyperparameters']

        # Get the Thresholded Response
        yStar, idxList = self.removeCommonData(hpMap['spreadStatistic'],
                                               hpMap['threshold'])
        yStar = [[elem] for elem in yStar]

        # Instantiate the Isolation Forest
        ISO = IsolationForest(n_estimators=hpMap['numEstimators'],
                              bootstrap=hpMap['bootstrap'])

        # Fit and Predict with the Isolation Forest
        predictions = ISO.fit_predict(yStar)
        scores = ISO.decision_function(yStar)

        # Report the Lon/Lat Points Corresponding to the Anomalies
        # in the Order of Decreasing Anomaly Score (i.e., the Most
        # Anomalous Points are Shown First)
        anomalyIdxList = [
            idxList[i] for i in range(len(yStar)) if predictions[i] == -1
        ]
        anomalyLonLatMap = {scores[i]: (self.M['longitude'][idx], self.M['latitude'][idx]) \
                            for idx in anomalyIdxList}
        sortedScores = sorted(scores)
        anomaliesLonLatSorted = [anomalyLonLatMap[sortedScores[i]] for i in range(len(sortedScores)) \
                                 if sortedScores[i] in anomalyLonLatMap]
        return anomaliesLonLatSorted
Example #22
def workIso(modelDir, inputDict):

	samples = float(inputDict['samples'])
	estimators = int(inputDict['estimators'])
	contamination = float(inputDict['contaminationIso'])
	if contamination == 0:
		contamination = 'auto'
	clf = IsolationForest(max_samples=samples, n_estimators=estimators,
		contamination=contamination, random_state=42)

	# load our csv to df
	f = StringIO(inputDict["file"])
	df = pd.read_csv(f)
	datapoints = df.to_numpy()

	maxVals = np.max(datapoints, axis=0)
	maxVals = np.tile( maxVals, (datapoints.shape[0],1) )
	normalizedDatapoints = np.divide(datapoints,maxVals)

	labels = clf.fit_predict(normalizedDatapoints)
	scores = clf.score_samples(normalizedDatapoints)

	plotData = []
	x = np.arange(0,datapoints.shape[0])
	data = go.Scatter( x=x, y=datapoints[:,0], name='data', mode='lines+markers' ) 
	plotData.append(data)
	outliers = go.Scatter( x=x[labels!=1], y=datapoints[labels!=1, 0], name='outliers',
		mode='markers' ) 
	plotData.append(outliers)

	return plotData
Example #23
    def remove_outliers(self):
        clf = IsolationForest(n_estimators=20,
                              contamination=0.1)
        pred = clf.fit_predict(self.df.loc[:, self.xcols + self.ycols])
        self.df = self.df[pred != -1]
        self.df = self.df.reset_index(drop=True)
Example #24
def check_outlier_iforest(
        x: pd.DataFrame,
        cols=[
            'col1',
            'col2',
            'col3',
            'col4',
        ]) -> pd.DataFrame:
    """
    Check outliers with isolation forest not including null.
    """
    from sklearn.ensemble import IsolationForest

    intersection_cols = list(x.columns.intersection(cols))
    df = pd.DataFrame(columns=['outlier_percent'])
    for col in intersection_cols:
        temp = x[[col]]
        temp = temp.dropna()
        clf = IsolationForest()
        preds = clf.fit_predict(temp[col].values.reshape(-1, 1))
        percent = len(preds[preds < 0]) / len(preds)
        df.loc[col, 'outlier_percent'] = percent

    if len(df) == 0:
        print('Outlier warnings: There are no columns.')

    return df.sort_values(ascending=False, by=['outlier_percent'])
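A small, hedged example of calling check_outlier_iforest (the column names are placeholders):

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
demo = pd.DataFrame({'col1': rng.normal(size=100), 'col2': rng.normal(size=100)})
demo.loc[::25, 'col1'] = 50  # a few extreme values
print(check_outlier_iforest(demo, cols=['col1', 'col2']))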
Example #25
def IFdrop1(df):
    IForest = IsolationForest(random_state=0,
                              n_jobs=-1,
                              verbose=0,
                              contamination=0.03)
    y_pred = IForest.fit_predict(df.values.reshape(-1, 2))
    return pd.DataFrame(y_pred)
Example #26
def get_outliers_isolation_forest(
    df,
    n_estimators=100,
    contamination="auto",
    n_jobs=-1,
):
    print(f"\nGet outliers with Isolation Forest...")
    # Identify outliers in the training dataset
    iso = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        bootstrap=True,
        n_jobs=n_jobs,
        verbose=1,
        random_state=rnd_state,
    )
    yhat = iso.fit_predict(df)
    print(f"Found {sum(yhat == -1)} outliers.")

    # Get mask for all TRAIN rows that are not outliers
    outliers_mask_train_iforest = yhat != -1
    joblib.dump(
        outliers_mask_train_iforest,
        data_processed_dir / "outliers_mask_train_iforest.joblib",
    )
    print(f"Outliers mask by Isolation Forest saved to {data_processed_dir}.")

    return outliers_mask_train_iforest
Example #27
def main():
    '''
    The procedure contains two simple steps:
        - Scale the data to the standard distribution with mean 0 and unit variance.
          This might be too simplistic.
        - Apply the isolation forest.  The contamination level is set manually.
    '''
    domains = []
    raw = []

    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())

            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])

                if analyser['analyser'] == 'WordSegmentation':
                    domains.extend(analyser['output'].keys())

            if len(raw) != len(domains):
                print(record)
                sys.exit(0)

    x_samples = scale(np.array(raw))

    engine = IsolationForest(contamination=0.015)
    y_samples = engine.fit_predict(x_samples)

    for index, y_sample in enumerate(y_samples):
        if y_sample == -1:
            print(domains[index])
Example #28
def outlierElim(ids, data, cont=0.05):
    od = IsolationForest(contamination=cont)
    outlierIds = []
    for x in data:
        darr = data[x]
        f_outliers = od.fit_predict(darr)
        drop_o = np.nonzero(np.where(f_outliers == -1, 1, 0))[0]
        outlierIds.append(ids[drop_o])
    common = np.hstack(outlierIds)
    u, count_o = np.unique(common, return_counts=True)
    outlier = u[count_o > 3]
    print(outlier)
    _, _, outlier_ind = np.intersect1d(outlier, ids, return_indices=True)
    np.savetxt(output_fld + 'ids_outlier.csv',
               ids[outlier_ind],
               delimiter=",",
               fmt='%12.5f')
    ids = np.delete(ids, outlier_ind)
    np.savetxt(output_fld + 'ids_outlierDropped.csv',
               ids,
               delimiter=",",
               fmt='%12.5f')
    for x in data:
        data[x] = np.delete(data[x], outlier_ind, axis=0)
        np.savetxt(output_fld + x + '_outlierDropped.csv',
                   data[x],
                   delimiter=",",
                   fmt='%12.5f')
    return ids, data
Example #29
def remove_outliers(model_name, X, y, **add_params):
    """
    For given X and y, removes detected outliers using either Isolation Forest or Local Outlier Factor.

    :param model_name: str - 'isf' (for Isolation Forest), 'lof' (for Local Outlier Factor)
    :param X: numpy array
    :param y: numpy array
    :param add_params: additional_params for Isolation Forest / Local Outlier Factor models
    :return: X, y with removed outliers
    """
    model_name = model_name.lower()

    if model_name == 'isf':
        # e.g. n_estimators=150, max_samples=0.8, max_features=0.8, contamination="auto"
        clf = IsolationForest(random_state=RANDOM_STATE,
                              n_jobs=-1,
                              **add_params)
    elif model_name == 'lof':
        clf = LocalOutlierFactor(n_jobs=-1, **add_params)
    else:
        raise Exception("Choose one of predefined models ('isf' or 'lof')")

    results = clf.fit_predict(X)

    outliers = int((results == -1).sum())
    model_label = 'Isolation forest' if model_name == 'isf' else 'Local outlier factor'
    print("{} found {} outliers".format(model_label, outliers))

    removing_indices = [i for i in range(0, len(results)) if results[i] == -1]
    X_new = np.delete(X, removing_indices, axis=0)
    y_new = [y[yi] for yi in range(0, len(y)) if results[yi] == 1]

    return X_new, y_new
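A brief usage sketch matching the docstring's contract (RANDOM_STATE is a module-level constant in the original; it is stubbed here):

import numpy as np

RANDOM_STATE = 42  # stand-in for the module-level constant
rng = np.random.RandomState(RANDOM_STATE)
X_demo = rng.normal(size=(300, 5))
y_demo = rng.randint(0, 2, size=300)
X_new, y_new = remove_outliers('isf', X_demo, y_demo, contamination=0.05)
print(X_demo.shape[0] - X_new.shape[0], "rows removed")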
Example #30
def detect_outliers(train_data):
    outliers_list = []
    for run in range(1, 1000):
        clf = IsolationForest(contamination='auto')
        outliers_predict = clf.fit_predict(train_data)
        print("------------------- Isolation Forest ", run)
        outliers = 0
        outliers_id = []
        # start at 0 so the first sample is not skipped, and use a separate
        # variable so the run counter above is not shadowed
        for i in range(len(outliers_predict)):
            if outliers_predict[i] == -1:
                outliers += 1
                outliers_id.append(i)
        outliers_list.append(outliers_id)

    my_dict = {}
    results = []
    for i in range(0, train_data.shape[0]):
        my_dict[i] = 0
    for l in outliers_list:
        for i in l:
            my_dict[i] += 1

    my_dict_s = sorted(my_dict.items(), key=lambda kv: kv[1])
    print(my_dict_s)

    for i in my_dict.keys():
        if my_dict[i] > 900:
            results.append(i)

    #print_histogram(d)
    return results
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        # the original fell through with a no-op comparison (method == None)
        # and then crashed on an undefined model; fail fast instead
        raise ValueError("method must be 'Isolation Forest' or 'Local Outlier Factor'")
    if method == 'Isolation Forest':
        do_outlier_removal.fit(np.array(df[col]))
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal