Ejemplo n.º 1
0
def iforest(X_train, X_test, Y_train, Y_test):
    """Fit a pyod Isolation Forest on X_train and score its labels on X_test.

    The detector is seeded (random_state=0). Y_train is accepted but not
    used: the detector is fitted on the features only.

    Returns the share of test rows whose predicted label equals Y_test,
    expressed as a percentage; the raw fraction is printed as well.
    """
    from pyod.models.iforest import IForest

    detector = IForest(random_state=0)
    detector.fit(X_train)

    predicted = detector.predict(X_test)
    n_matching = np.sum(predicted == Y_test)
    n_samples = X_test.shape[0]

    acc = n_matching / n_samples
    print(acc)
    return acc * 100
Ejemplo n.º 2
0
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points with an Isolation Forest.

    The detector is fitted on x_raw itself and every row it labels as an
    outlier is removed from both the features and the labels.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # Only the Isolation Forest is fitted; other pyod detectors can be swapped
    # in here as long as they accept the same contamination argument.
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    # Walk over the predictions themselves so any number of rows is handled
    # (the row count used to be hard-coded).
    idx_y_pred = [i for i, label in enumerate(y_pred) if label == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
Ejemplo n.º 3
0
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Return the codes of the stocks that an Isolation Forest flags as outliers.

    Each rule is called once per column (date) of ``all_stocks_cip`` for every
    stock and the points it returns are summed. The per-rule totals form the
    feature row of that stock, and the detector is fitted on those rows.

    :param stocks: stock codes to score; a code that raises KeyError during the
        sector or daily low/high lookup is skipped with a warning
    :param all_stocks_cip: change-in-percent frame, read as ``.at[stock, date]``
    :param rules: callables taking the state dict and returning points;
        defaults to ``default_point_score_rules()``
    :return: list of stock codes with a positive predicted label; empty when
        there is nothing to score
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}
    rows = []
    stocks_by_sector_df = stocks_by_sector()  # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df['asx_code']
    for stock in stocks:
        try:
            sector = stocks_by_sector_df.at[stock, 'sector_name']
            sector_companies = list(
                stocks_by_sector_df.loc[stocks_by_sector_df['sector_name'] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(None, "Unable to locate watchlist entry: {} - continuing without it".format(stock))
            continue
        state = {
            'day_low_high_df': day_low_high_df,  # never changes each day, so we init it here
            'all_stocks_change_in_percent_df': all_stocks_cip,
            'stock': stock,
            'daily_range_threshold': 0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            market_avg = all_stocks_cip[date].mean()
            sector_avg = all_stocks_cip[date].filter(items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({'market_avg': market_avg, 'sector_avg': sector_avg,
                          'stock_move': stock_move, 'date': date})
            for rule_name, rule in str_rules.items():
                points_by_rule[rule_name] += rule(state)
        d = {'stock': stock}
        d.update(points_by_rule)
        rows.append(d)

    if not rows:
        # An empty record list yields a frame without a 'stock' column, so
        # set_index() below would raise KeyError.
        print("Found 0 outlier stocks")
        return []
    df = pd.DataFrame.from_records(rows)
    df = df.set_index('stock')
    print(df)
    if df.shape[1] == 0:
        # No rule columns (no rules or no dates): nothing to fit the detector on.
        print("Found 0 outlier stocks")
        return []
    from pyod.models.iforest import IForest
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    results = [code for code, label in zip(df.index, scores) if label > 0]
    print("Found {} outlier stocks".format(len(results)))
    return results
Ejemplo n.º 4
0
    def transform(self, df2: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe.

        Steps, in order:
        1. keep only make/model pairs whose 'make' count exceeds 100;
        2. per make/model pair, fit an Isolation Forest (contamination 0.01)
           on the usage/price columns and drop the rows it labels as outliers;
        3. label-encode the object-typed feature columns, storing each
           column's class -> code mapping in ``self.dic``.

        Returns the encoded feature columns with 'dep_percentage' appended as
        the last column.
        """
        le = LabelEncoder()
        # NOTE: this adds the helper column 'mm' to the caller's frame in place.
        df2['mm'] = df2['make'] + ' ' + df2['model']
        g_mm_count = df2.groupby(['mm']).count().reset_index()
        mm_more_than_100 = g_mm_count[g_mm_count['make'] > 100]['mm']
        df2 = df2[df2['mm'].isin(mm_more_than_100)]

        clf1 = IForest(contamination=0.01)
        outlier_features = [
            'kms_run', 'owners', 'age', 'quoted_price', 'dep_percentage'
        ]
        # Collect the index labels of the outliers directly instead of growing
        # a frame with pd.concat on every group.
        outlier_index = set()
        for _, group_df in df2.groupby('mm'):
            group_features = group_df[outlier_features]
            clf1.fit(group_features)
            y_pred = clf1.predict(group_features)
            outlier_index.update(group_df.index[y_pred == 1])

        df = df2.drop(list(outlier_index))
        # Explicit copy: the encoded values are written back column by column.
        X = df[[
            'make', 'model', 'city', 'variant', 'owners', 'kms_run', 'age',
            'Popularity Index', 'ex_showroom_price', 'fuel_type',
            'transmission', 'color'
        ]].copy()
        categorical_cols = X.columns[X.dtypes == object].tolist()
        self.dic = {}
        for col in categorical_cols:
            X[col] = le.fit_transform(X[col])
            self.dic[col] = dict(zip(le.classes_, le.transform(le.classes_)))
        y = df[['dep_percentage']]

        return pd.concat([X, y], axis=1)
Ejemplo n.º 5
0
def add_other_class(num, size, pad):
    """Extend the training list with ``num`` random cells labelled "其他".

    An Isolation Forest is fitted on the flattened cells listed in
    ``data/train.txt``. Random positions at least ``pad`` away from the raster
    border are then drawn until ``num`` of them are predicted as outliers;
    those positions are appended to the training rows.

    :param num: number of extra cells to add; nothing is added when <= 0
    :param size: cell size passed to ``get_cell``
    :param pad: margin kept from each raster edge when drawing positions

    Writes ``data/train_enhance.txt`` and ``data/train_enhance_view.txt``; the
    second file has the third column negated.
    """
    res = pd.read_csv("data/train.txt", header=None).values
    tif_data = []
    for record in tqdm(res):
        img = get_cell(record[1], record[2], size)
        if img is None:
            print("img NOT Exist.", record)
            continue
        img = img.reshape(-1).tolist()
        tif_data.append([labels_key[record[0]]] + img)
    tif_data = np.array(tif_data)
    print(tif_data.shape)

    np.random.shuffle(tif_data)
    clf = IForest()
    # column 0 holds the label, the remaining columns the flattened cell
    clf.fit(tif_data[:, 1:])

    i = 0
    pos = []
    false_num = 0
    # Checking the counter up front also ends the loop for num <= 0; the former
    # "i == num" test after the increment could never become true in that case.
    while i < num:
        ix = np.random.randint(pad, dataset.RasterXSize - pad)
        iy = np.random.randint(pad, dataset.RasterYSize - pad)
        t = get_cell(ix, iy, size)
        if t is None:
            continue
        t = t.reshape(1, -1)
        y_test_pred = clf.predict(t)[0]  # outlier labels (0 or 1)
        if y_test_pred == 1:
            i += 1
            pos.append(["其他"] + [ix, iy])
            print("{}/{} added.".format(i, num))
        else:
            false_num += 1
            print("{}/{} is not include {}.{}. false_num: {}".format(
                i, num, ix, iy, false_num))

    # An empty list becomes a 1-D array that cannot be stacked under the 2-D
    # training rows, so only concatenate when something was added.
    if pos:
        pos = np.concatenate((res, np.array(pos)), axis=0)
    else:
        pos = res
    print(Counter(pos[:, 0]))

    pd.DataFrame(pos).to_csv("data/train_enhance.txt", index=None, header=None)

    # np.int was removed from NumPy; the builtin int is the same type.
    pos[:, 2] = -1 * (pos[:, 2].astype(int))
    pd.DataFrame(pos).to_csv("data/train_enhance_view.txt",
                             index=None,
                             header=None)
Ejemplo n.º 6
0
class IForestWrapper:
    """Thin adapter exposing an IForest detector through fit/predict methods.

    All keyword arguments given to the constructor are forwarded to IForest.
    """

    def __init__(self, **kwargs):
        self._model = IForest(**kwargs)

    def fit(self, X, T=None):
        """Fit the wrapped detector on X and return this wrapper.

        T is accepted so the wrapper can be called like a supervised
        estimator, but it is never used: the detector is unsupervised.
        """
        self._model.fit(X)
        return self

    def predict(self, X):
        """Return the wrapped detector's predictions for X."""
        return self._model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped detector's probability estimates for X."""
        return self._model.predict_proba(X)
Ejemplo n.º 7
0
def main():
    """Fit IForest three times on an imputed 70/30 split and print its metrics.

    The data comes from ``pre_data()``; missing values are replaced by the
    column mean before splitting. Each round prints accuracy, precision and
    recall for the training labels and for the held-out labels, plus the
    ``evaluate_print`` summary of the raw outlier scores.
    """
    from numpy import nan as NA
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score

    dataset, label = pre_data()
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)
    x_train, x_test, y_train, y_label = train_test_split(dataset,
                                                         label,
                                                         test_size=0.3,
                                                         random_state=44)
    clf_name = 'IForest'
    for _ in range(3):
        clf = IForest()
        clf.fit(x_train)

        # get the prediction label and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))
        # get the prediction on the test data
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        # Test-set metrics: precision and recall must use the held-out labels
        # and predictions, like the accuracy does.
        print(accuracy_score(y_label, y_test_pred))
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_label, y_test_scores)
Ejemplo n.º 8
0
    def get_IF_scores(dataframe,
                      cols,
                      outliers_fraction=0.01,
                      standardize=True):
        '''Label the rows of a dataframe as inliers/outliers with an Isolation Forest.

        Args:
            dataframe: frame holding the selected columns; it is modified in
                place (scaled columns when standardize is set, plus a new
                'outlier' column)
            cols: list of selected column names used as features
            outliers_fraction: contamination passed to IForest, 0.01 by default
            standardize: when true, min-max scale the selected columns to
                [0, 1] before fitting

        Returns:
            the same dataframe with an 'outlier' column added (1 = outlier,
            0 = inlier); it is also stored as CheckOutliers.df3
        '''
        if standardize:
            # standardize selected variables (written back into the frame)
            minmax = MinMaxScaler(feature_range=(0, 1))
            dataframe[cols] = minmax.fit_transform(dataframe[cols])

        # Build the feature matrix: one column per selected dataframe column
        X = np.column_stack([dataframe[col].values for col in cols])

        # fit
        clf = IForest(contamination=outliers_fraction, random_state=0)
        clf.fit(X)

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        CheckOutliers.df3 = dataframe
        CheckOutliers.df3['outlier'] = y_pred.tolist()

        # The detector used here is IForest, so report it under that name.
        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
              'found with IForest')
        return dataframe
Ejemplo n.º 9
0
    n_test = 100  # number of testing points

    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, contamination=contamination)

    # train IForest detector
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # get the prediction label and decision_scores_ on the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
Ejemplo n.º 10
0
    #print(data['s'])

    #划分测试集和训练集
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    #使用pyod中的IForest算法拟合数据
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # The predicted labels form an array of 0s and 1s: 1 marks an outlier, 0 a non-outlier
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores,The outlier scores of the training data.

    #预测样本是不是离群点,返回0和1 的数组
    y_test_pred = clf.predict(X_test)

    y_test_scores = clf.decision_function(
        X_test)  # outlier scores,The anomaly score of the input samples.
    #使用sklearn中的roc_auc_score方法得到auc值,即roc曲线下面的面积
    try:
        sumAuc_train += sklearn.metrics.roc_auc_score(y_train,
                                                      y_train_scores,
                                                      average='macro')
        sumAuc_test += sklearn.metrics.roc_auc_score(y_test,
                                                     y_test_scores,
                                                     average='macro')
        #s=precision_score(y_train, y_train_scores, average='macro')
        i += 1
        print(sumAuc_train, sumAuc_test)
    except ValueError:
Ejemplo n.º 11
0
class TestIForest(unittest.TestCase):
    """Checks on an IForest detector fitted on seeded synthetic data."""

    def setUp(self):
        """Generate the seeded train/test data and fit the detector on it."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        splits = generate_data(n_train=self.n_train,
                               n_test=self.n_test,
                               contamination=self.contamination,
                               random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _check_proba_bounds(self, **predict_kwargs):
        """Assert that the predicted probabilities for X_test lie in [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, **predict_kwargs)
        assert (proba.min() >= 0)
        assert (proba.max() <= 1)

    def _check_rank_order(self, ranks, scores, upper_bound):
        """Assert that ranks follow the score order and stay within bounds."""
        # the rank order may deviate from the score order by at most 3 places
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, upper_bound)
        assert_array_less(-0.1, ranks)

    def test_parameters(self):
        """Every fitted attribute must exist and be set."""
        fitted_attributes = (
            'decision_scores_', 'labels_', 'threshold_', '_mu', '_sigma',
            'estimators_', 'estimators_samples_', 'max_samples_',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # check performance
        auc = roc_auc_score(self.y_test, scores)
        assert (auc >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the shape of the true test labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        self._check_proba_bounds()

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        self._check_proba_bounds(method='linear')

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        self._check_proba_bounds(method='unify')

    def test_prediction_proba_parameter(self):
        """An unknown probability method is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """The default and both named metrics work; unknown ones are rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Raw ranks preserve the score order and stay below n_train + 1."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)
        self._check_rank_order(ranks, scores, self.X_train.shape[0] + 1)

    def test_predict_rank_normalized(self):
        """Normalized ranks preserve the score order and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)
        self._check_rank_order(ranks, scores, 1.01)

    def test_model_clone(self):
        """Cloning the fitted detector must not raise."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
Ejemplo n.º 12
0
class TestIForest(unittest.TestCase):
    """IForest checks on a small seeded synthetic data set."""

    def setUp(self):
        """Create the seeded data and a detector fitted on the training part."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination,
                                  random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_unit_interval(self, **proba_kwargs):
        """Check that predict_proba on X_test stays between 0 and 1."""
        proba = self.clf.predict_proba(self.X_test, **proba_kwargs)
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def _assert_rank_consistency(self, ranks, scores, limit):
        """Check that ranks keep the score order and lie in (-0.1, limit)."""
        # the two orderings may differ by at most 3 positions
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, limit)
        assert_array_less(-0.1, ranks)

    def test_sklearn_estimator(self):
        """The detector passes scikit-learn's estimator checks."""
        check_estimator(self.clf)

    def test_parameters(self):
        """All attributes produced by fit() are present and not None."""
        expected = [
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        ]
        for name in expected:
            assert_true(hasattr(self.clf, name) and
                        getattr(self.clf, name) is not None)

    def test_train_scores(self):
        """One decision score is stored per training row."""
        stored = len(self.clf.decision_scores_)
        assert_equal(stored, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Scores on the test set have the right size and beat the ROC floor."""
        test_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # check performance
        achieved = roc_auc_score(self.y_test, test_scores)
        assert_greater(achieved, self.roc_floor)

    def test_prediction_labels(self):
        """predict() yields labels shaped like y_test."""
        predicted = self.clf.predict(self.X_test)
        assert_equal(predicted.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default predict_proba output is within [0, 1]."""
        self._assert_unit_interval()

    def test_prediction_proba_linear(self):
        """'linear' predict_proba output is within [0, 1]."""
        self._assert_unit_interval(method='linear')

    def test_prediction_proba_unify(self):
        """'unify' predict_proba output is within [0, 1]."""
        self._assert_unit_interval(method='unify')

    def test_prediction_proba_parameter(self):
        """predict_proba raises ValueError for an unknown method."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict() yields labels shaped like y_train."""
        predicted = self.clf.fit_predict(self.X_train)
        assert_equal(predicted.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Supported scorings run; an unsupported one raises NotImplementedError."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring_name in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring_name)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Unnormalized ranks follow the scores and stay below n_train + 1."""
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test)
        self._assert_rank_consistency(test_ranks, test_scores,
                                      self.X_train.shape[0] + 1)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the scores and stay below 1.01."""
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        self._assert_rank_consistency(test_ranks, test_scores, 1.01)

    def tearDown(self):
        """No per-test cleanup is required."""
Ejemplo n.º 13
0
#############################################
# And the conversion.
# The model is converted with to_onnx for opset 14.

# NOTE(review): IForest is presumably None when pyod could not be imported -
# confirm against the import section of this script.
if IForest is not None:
    onx = to_onnx(model1, initial_types=initial_type,
                  target_opset=14)

###############################################
# Checking discrepancies
# ++++++++++++++++++++++
# The original model and the ONNX graph are run on the same float32 data and
# the largest absolute differences of labels and probabilities are printed.

if IForest is not None:
    # the ONNX session is fed float32 data, so the model gets the same cast
    data = sc_data.astype(np.float32)

    expected_labels = model1.predict(data)
    expected_proba = model1.predict_proba(data)

    sess = InferenceSession(onx.SerializeToString())
    res = sess.run(None, {'float_input': data})

    # first output: labels, second output: probabilities
    onx_labels = res[0]
    onx_proba = res[1]

    # flatten both sides so arrays of different shape can be compared
    diff_labels = np.abs(onx_labels.ravel() - expected_labels.ravel()).max()
    diff_proba = np.abs(onx_proba.ravel() - expected_proba.ravel()).max()

    print("dicrepencies:", diff_labels, diff_proba)

    print("ONNX labels", onx_labels)
    print("ONNX probabilities", onx_proba)
Ejemplo n.º 14
0
def train_model(request):
    """Fit an Isolation Forest on a CSV file named in a POST body.

    The request body must be JSON with a 'file' entry that ``pd.read_csv`` can
    open. The detector is fitted on the 'Birth year' and 'Uid' columns and
    stored in the module-level ``clf``.

    Returns a JsonResponse with the outlier and inlier counts. Other request
    methods get a 405, a malformed body or unusable column values a 400, and
    any other failure a 500; each error response carries an 'error' message.
    """
    global clf
    if request.method != 'POST':
        return JsonResponse({'error': 'Only POST requests are supported'},
                            status=405)
    try:
        json_data = json.loads(request.body)
        print(json_data)
        # NOTE(security): 'file' comes straight from the request and is handed
        # to read_csv unchecked - restrict it to known locations before
        # exposing this view to untrusted clients.
        file = json_data['file']
        data = pd.read_csv(file)
        data = data.fillna(0)

        # Strip the slashes from the non-empty birth years; rows that held the
        # 0 placeholder drop out of the assignment and are zero-filled again.
        birth_year = data["Birth year"]
        data["Birth year"] = birth_year[birth_year != 0].str.replace(
            "/", "").astype(int)
        data = data.fillna(0)
        data['Uid'] = data['Uid'].astype(str).str.replace(' ',
                                                          '').astype(float)

        X = np.column_stack((data['Birth year'].values, data['Uid'].values))
        outliers_fraction = 0.01
        clf = IForest(contamination=outliers_fraction, random_state=0)
        clf.fit(X)

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        # plain ints keep the payload JSON-serializable
        n_inliers = int(len(y_pred) - np.count_nonzero(y_pred))
        n_outliers = int(np.count_nonzero(y_pred == 1))

        print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers)

        output = {'OUTLIERS ': n_outliers, 'INLIERS ': n_inliers}

        return JsonResponse(output)
    except (KeyError, ValueError) as exc:
        # missing keys/columns, invalid JSON or unparsable column values
        return JsonResponse({'error': str(exc)}, status=400)
    except Exception as exc:
        # The exception class itself is not serializable; report its message.
        return JsonResponse({'error': str(exc)}, status=500)
Ejemplo n.º 15
0
    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train IForest detector
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
Ejemplo n.º 16
0
class TestIForest(unittest.TestCase):
    """Checks on an IForest detector fitted on generated data."""

    def setUp(self):
        """Generate train/test data and fit the detector on the training part."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        self.clf = IForest(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """The detector passes scikit-learn's estimator checks."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every attribute produced by fit() must exist and not be None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        )
        for attribute in fitted_attributes:
            # A missing attribute is reported like an unset one. The previous
            # assertRaises(AttributeError, '<message>') tried to call the
            # message string and raised TypeError instead of failing the test.
            self.assertIsNotNone(getattr(self.clf, attribute, None),
                                 '{} is not set'.format(attribute))

    def test_train_scores(self):
        """There is one training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and exceed the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the shape of the true test labels."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        """fit_predict_evaluate runs on the test data without raising."""
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        """Nothing to clean up."""
Ejemplo n.º 17
0
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Return the stocks, among those specified, that an isolation forest flags as outliers.

    Each stock is scored against every rule on every date (column) of
    all_stocks_cip; the per-rule point totals form one feature row per stock and
    an IForest is then fitted on, and asked to label, those rows.

    :param stocks: stock codes to examine. Codes which raise KeyError during the
                   sector or daily low/high lookup are reported via warning()
                   and left out.
    :param all_stocks_cip: the "change in percent" for at least the stocks present
                           in the specified list (rows are stocks, columns are dates)
    :param rules: callables taking the per-date state dict and returning points;
                  defaults to default_point_score_rules()
    :return: list of the stock codes labelled as outliers. Empty when no stock
             could be scored or when there is nothing to score them on.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}

    # NB: ETFs in watchlist will have no sector
    # set_index() builds a new frame, so the object returned by stocks_by_sector()
    # (which may be shared with other callers) is not modified in place.
    stocks_by_sector_df = stocks_by_sector().set_index("asx_code", drop=False)

    # The market-wide average depends only on the date, so compute it once
    # instead of once per stock.
    market_avg_by_date = {
        date: all_stocks_cip[date].mean()
        for date in all_stocks_cip.columns
    }

    rows = []
    for stock in stocks:
        try:
            sector = stocks_by_sector_df.at[stock, "sector_name"]
            sector_companies = list(stocks_by_sector_df.loc[
                stocks_by_sector_df["sector_name"] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(
                None,
                "Unable to locate watchlist entry: {} - continuing without it".
                format(stock),
            )
            continue
        state = {
            "day_low_high_df":
            day_low_high_df,  # never changes each day, so we init it here
            "all_stocks_change_in_percent_df": all_stocks_cip,
            "stock": stock,
            "daily_range_threshold":
            0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            state.update({
                "market_avg": market_avg_by_date[date],
                "sector_avg": all_stocks_cip[date].filter(
                    items=sector_companies).mean(),
                "stock_move": all_stocks_cip.at[stock, date],
                "date": date,
            })
            for rule_name, rule in str_rules.items():
                try:
                    # The defaultdict lookup on the left creates the entry before
                    # rule() runs, so every rule gets a column even if it raises.
                    points_by_rule[rule_name] += rule(state)
                except TypeError:  # handle nan's in dataset safely
                    pass
        row = {"stock": stock}
        row.update(points_by_rule)
        rows.append(row)

    results = []
    # Without rows there is no "stock" column to index on, and without feature
    # columns the forest has nothing to fit: both cases mean "no outliers".
    if rows:
        df = pd.DataFrame.from_records(rows).set_index("stock")
        if not df.empty:
            clf = IForest()
            clf.fit(df)
            labels = clf.predict(df)
            results = [
                code for code, label in zip(df.index, labels) if label > 0
            ]
    print("Found {} outlier stocks".format(len(results)))
    return results
class Remove_Outliers(BaseEstimator, TransformerMixin):
    """
    Drop the rows of a training frame that every selected detector flags as an outlier.

    Rows are only removed by fit_transform(); fit() and transform() are
    pass-throughs, so data transformed after fitting is returned untouched.

    :param target: name of the target column; it is excluded from the features
                   handed to the detectors
    :param contamination: expected outlier fraction passed to the 'iso', 'knn'
                          and 'pca' detectors ('mcd' uses a fixed 0.01)
    :param random_state: seed passed to the 'iso' and 'pca' detectors
    :param methods: detectors to run, any of 'knn', 'iso', 'pca', 'mcd'

    After fit_transform() the rows that were voted out are available as
    ``self.outliers`` (including the per-detector and vote columns).
    """

    def __init__(self,
                 target,
                 contamination=.20,
                 random_state=42,
                 methods=['knn', 'iso', 'mcd']):
        # NOTE: the default list is shared between instances; it is only ever
        # read by this class, never modified.
        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        """Do nothing and return the estimator itself so calls can be chained."""
        return self

    def transform(self, data, y=None):
        """Return data unchanged: outliers are only removed in fit_transform()."""
        return data

    def fit_transform(self, dataset, y=None):
        """
        Fit the selected detectors on dataset and return it without the rows
        that all of them label as outliers.

        :param dataset: DataFrame containing the feature columns and self.target
        :param y: ignored
        :return: the rows of dataset whose index label does not belong to a
                 unanimously flagged row; dataset itself is not modified
        """
        data = dataset.copy()
        # Every detector sees the same feature matrix. Taking it once, before
        # any prediction column is added to data, keeps one detector's labels
        # from being fed to the next detector as a feature.
        features = data.drop(self.target, axis=1)

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(features)
            data['iso'] = self.iso_forest.predict(features)

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(features)
            data['knn'] = self.knn_out.predict(features)

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(features)
            data['pca'] = self.out_pca.predict(features)

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(features)
            # EllipticEnvelope labels outliers -1 and inliers +1; convert to the
            # 1 = outlier / 0 = inlier convention of the other detectors so the
            # votes can be summed.
            data['mcd'] = (self.mcd.predict(features) == -1).astype(int)

        data['vote_outlier'] = 0
        for method in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[method]

        n_methods = len(self.methods)
        if n_methods > 0:
            # a row is an outlier only when every detector voted for it
            self.outliers = data[data['vote_outlier'] == n_methods]
        else:
            # no detector ran: a zero vote must not count as unanimous
            self.outliers = data.iloc[0:0]

        return dataset[~dataset.index.isin(self.outliers.index)]