Example #1
def remove_outliers_knn(
        x: pd.DataFrame,
        y: np.ndarray,
        contamination: float = 0.1) -> Tuple[pd.DataFrame, np.ndarray]:
    """Remove outliers from the training/test set using PyOD's KNN classifier

    Args:
        x: DataFrame containing the X's
        y: target array
        contamination: the amount of contamination of the data set

    Returns:
        x and y with outliers removed
    """
    clf = KNN(contamination=contamination, n_jobs=-1)

    clf.fit(x)

    labels = clf.labels_

    print(
        "{0:.2%} among {1:,} sample points are identified and removed as outliers"
        .format(sum(labels) / x.shape[0], x.shape[0]))

    x = x.iloc[labels == 0]
    y = y[labels == 0]

    return x, y
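A minimal usage sketch for the function above (the data here is synthetic and illustrative, assuming `from pyod.models.knn import KNN` plus the usual pandas/numpy imports):

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
X_df = pd.DataFrame(rng.randn(500, 4), columns=list("abcd"))  # synthetic features
y = rng.randint(0, 2, size=500)  # synthetic target

# drops the rows KNN flags as outliers from both X and y
X_clean, y_clean = remove_outliers_knn(X_df, y, contamination=0.05)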
Example #2
def train():
    dataset = get_data(1000, 10, 100)
    contamination = 0.01
    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,  
                    rp_flag_global=True,  
                    bps_flag=True,  
                    approx_flag_global=False, 
                    contamination=contamination)
        model.fit(dataset)  
        model.approximate(dataset)  
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        true_labels = [0]*1000 + [1]*10
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulting area under the ROC curve is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
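`vote` is not shown in this snippet. Since SUOD's `predict` here appears to return one binary label column per base estimator, a plausible majority-vote helper (an assumption, not the project's actual code) is:

import numpy as np

def vote(label_matrix):
    # label_matrix: (n_samples, n_estimators) binary labels from the base detectors;
    # flag a point as an outlier when more than half of the detectors agree
    return (np.asarray(label_matrix).mean(axis=1) > 0.5).astype(int)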
Example #3
    def get_all_readings_from_person(self,
                                     person_tag,
                                     remove_outliers=0,
                                     additional_where=""):
        #Debug.print_debug(self.file_path)
        print(self.file_path)
        dataset = sqlite3.connect(self.file_path)
        if len(additional_where) > 0:
            to_return = self.get_data_sql_query(
                "select {} from {} where {} like {} {}".format(
                    ', '.join(self.features), self.table_name,
                    self.person_column, person_tag, additional_where), dataset)
        else:
            to_return = self.get_data_sql_query(
                "select {} from {} where {} like '{}'".format(
                    ', '.join(self.features), self.table_name,
                    self.person_column, person_tag), dataset)
        self.data = to_return
        if (remove_outliers > 0):
            knn = KNN(contamination=remove_outliers)
            to_return_aux = to_return.copy()
            to_return_aux = to_return_aux.drop(self.label_tag, axis=1)
            knn.fit(to_return_aux)
            pred = knn.predict(to_return_aux)
            to_return = to_return.iloc[np.where(pred == 0)[0], :]

        return to_return
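Building the query with string formatting is fragile (note the inconsistent quoting of `person_tag` above) and open to SQL injection. A safer sketch, assuming `get_data_sql_query` wraps `pandas.read_sql_query`, binds the tag as a parameter (a hypothetical method, not the project's API):

import sqlite3
import pandas as pd

def get_readings_safe(self, person_tag):
    # column and table names still come from trusted configuration
    con = sqlite3.connect(self.file_path)
    sql = "select {} from {} where {} like ?".format(
        ', '.join(self.features), self.table_name, self.person_column)
    return pd.read_sql_query(sql, con, params=(person_tag,))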
Example #4
def run_KNN_base_detector(data, k, metric='euclidean', p=2, method='mean'):
    """
    Function to fit and predict the KNN base detector on `data`.
    
    Input:
     - data: pd.DataFrame, to run KNN on
     - k: integer, the number of neighbours to include in the relative density determination
     - metric: string, distance metric to use, default `euclidean`
     - p: int, default 2 since metric = `euclidean`, otherwise set according to the distance metric
     - method: string, how to aggregate the k neighbour distances (`largest`, `mean`, or `median`), default `mean`
     
    Output:
     - clf of class pyod.models.knn.KNN with all its properties
    """
    
    # Split data in values and targets: some datasets have an ID column, others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)
    
    # Construct and fit classifier
    clf = KNN(n_neighbors=k, metric=metric, p=p, method=method)
    clf.fit(X) # Fit only on features
    
    # Add ground truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']
    
    # Return the classifier for further processing
    return clf
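A usage sketch for `run_KNN_base_detector`, assuming a numeric DataFrame with an `outlier` ground-truth column (synthetic data for illustration):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy = pd.DataFrame(rng.randn(200, 3), columns=['f1', 'f2', 'f3'])
toy['outlier'] = (rng.rand(200) < 0.1).astype(int)  # ~10% labelled outliers

clf = run_KNN_base_detector(toy, k=10)
print(clf.decision_scores_[:5])  # raw outlier scores for the first rows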
Example #5
  def detectarOutlierKNN(self, idmodelo, Xtodos, corteOutlier):
    # Outlier detection 1 --------------------------------------------------------------
    clf = KNN()
    clf.fit(Xtodos)

    # get outlier scores
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_test_scores = clf.decision_function(Xtodos)  # outlier scores

    YCodigoTodosComOutilier = self.selectMatrizY(idmodelo, "ID", "TODOS")

    cont = 0
    amostrasRemovidas = 0

    for itemOutilier in y_train_scores:
      if itemOutilier > corteOutlier:
        contTodos = 0
        for item in YCodigoTodosComOutilier:
          amostra = str(item)
          amostra = amostra.replace("[", "")
          amostra = amostra.replace("]", "")
          if contTodos == cont:
            db.execute(
              " update amostra set tpamostra = 'OUTLIER' where idamostra = " + str(amostra) + " and idmodelo = " + str(
                idmodelo) + "")
            print(itemOutilier)
            amostrasRemovidas = amostrasRemovidas + 1
            break
          contTodos = contTodos + 1
      cont = cont + 1

    session.commit()
    print("Numero de Amostras Removidas: " + str(amostrasRemovidas))
    return cont
Example #6
def training(data, img_shape, re_sample_type, text_len, permission_names,
             extract_f):
    # load training data
    print('preparing training data')
    inputs, permissions = prepare_training_data(data, img_shape,
                                                re_sample_type, text_len,
                                                permission_names)

    # get features
    print('generating training features')
    features = extract_f.predict(inputs)

    # train auto encoder model, knn model
    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    features_in_permissions = [
    ]  # features in each permission, [permission_id, feature_id]
    for p in permission_names:
        print('training', p, '...')
        features_current = []
        for i in range(len(permissions)):
            if p in permissions[i]:
                features_current.append(features[i])
        features_in_permissions.append(features_current)

        detector = AutoEncoder(epochs=200, verbose=0)
        detector.fit(features_current)
        detectors.append(detector)

        knn = KNN()
        knn.fit(features_current)
        knn_trees.append(knn)

    return detectors, knn_trees, features_in_permissions
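At inference time the paired models can score a new feature vector for one permission; a hedged sketch (the function name and index handling are assumptions, not the project's code):

import numpy as np

def score_sample(feature, permission_id, detectors, knn_trees):
    # higher scores are more anomalous under both detectors
    f = np.asarray(feature).reshape(1, -1)
    ae_score = detectors[permission_id].decision_function(f)[0]
    knn_score = knn_trees[permission_id].decision_function(f)[0]
    return ae_score, knn_score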
Example #7
def construct_raw_base_estimators():
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.cblof import CBLOF
    from pyod.models.hbos import HBOS
    from pyod.models.iforest import IForest
    from pyod.models.abod import ABOD
    from pyod.models.ocsvm import OCSVM

    estimator_list = []

    # predefined range of n_neighbors for KNN, AvgKNN, and LOF
    k_range = [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    for k in k_range:
        estimator_list.append(
            KNN(n_neighbors=k, method="largest", contamination=0.05))
        estimator_list.append(
            KNN(n_neighbors=k, method="mean", contamination=0.05))
        estimator_list.append(LOF(n_neighbors=k, contamination=0.05))

    # predefined range of nu for one-class svm
    nu_range = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for nu in nu_range:
        estimator_list.append(OCSVM(nu=nu, contamination=0.05))

    # predefined range for number of estimators in isolation forests
    n_range = [10, 20, 50, 70, 100, 150, 200, 250]
    for n in n_range:
        estimator_list.append(
            IForest(n_estimators=n, random_state=42, contamination=0.05))

    return estimator_list
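One way to consume the returned list is to fit every base estimator, z-normalize the training scores, and average them; `standardizer` and `average` are pyod utilities (the wrapper function itself is a sketch):

import numpy as np
from pyod.models.combination import average
from pyod.utils.utility import standardizer

def combined_scores(X):
    # fit each raw base estimator and average the standardized outlier scores
    estimators = construct_raw_base_estimators()
    scores = np.zeros([X.shape[0], len(estimators)])
    for i, estimator in enumerate(estimators):
        estimator.fit(X)
        scores[:, i] = estimator.decision_scores_
    return average(standardizer(scores))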
Example #8
class IForestSupervisedKNN(BaseDetector):
    def __init__(self, get_top=0.8, if_params={}, knn_params={}):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False

        self.iforest = IForest(**if_params)

        self.knn = KNN(**knn_params)

    def fit(self, X, y=None):

        X = check_array(X)
        self._set_n_classes(y)

        self.iforest.fit(X)

        scores = self.iforest.predict_proba(X)[:, 1]

        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]

        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

        self.is_fitted = True

        return self

    def decision_function(self, X):

        check_is_fitted(self, ['is_fitted'])

        return self.knn.decision_function(X)
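A usage sketch for the hybrid detector, on pyod's synthetic data helper:

from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

det = IForestSupervisedKNN(get_top=0.8)
det.fit(X_train)  # IForest ranks points; KNN is fit on the top 80% "most normal"
print(det.predict(X_test)[:10])  # 0: inlier, 1: outlier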
Example #9
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define eleven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                behaviour="new"),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
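A sketch of how the returned dictionary might be consumed, fitting every detector on the same data and collecting binary labels (the wrapper is illustrative):

def fit_all(X, outliers_fraction=0.05):
    labels = {}
    for name, clf in load_classifiers(outliers_fraction).items():
        clf.fit(X)
        labels[name] = clf.labels_  # 0: inlier, 1: outlier
    return labels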
Example #10
    def __load_classifiers(self):
        outliers_fraction = 0.05
        random_state = np.random.RandomState(0)

        classifiers = {
            'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
            'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
            'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
            'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
            'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
            'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
            'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
            'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
            'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        }

        return classifiers
Example #11
    def __init__(self, window_size, step_size=1, contamination=0.1,
                 n_neighbors=5, method='largest',
                 radius=1.0, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None, n_jobs=1,
                 **kwargs):
        super(KDiscord, self).__init__(contamination=contamination)
        self.window_size = window_size
        self.step_size = step_size

        # parameters for kNN
        self.n_neighbors = n_neighbors
        self.method = method
        self.radius = radius
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

        # initialize a kNN model
        self.model_ = KNN(contamination=self.contamination,
                          n_neighbors=self.n_neighbors,
                          radius=self.radius,
                          algorithm=self.algorithm,
                          leaf_size=self.leaf_size,
                          metric=self.metric,
                          p=self.p,
                          metric_params=self.metric_params,
                          n_jobs=self.n_jobs,
                          **kwargs)
Example #12
    def some_random_test():
        np.set_printoptions(threshold=sys.maxsize)

        X = load_npz("X.npz").toarray()
        Y = genfromtxt('Y.csv', delimiter=',')

        # train kNN detector
        clf_name = 'KNN'
        clf = KNN()

        # find outliers per class
        # print(Y.shape)
        # print(X[Y == 1.].shape)
        # print(X[Y == 0.].shape)
        # print(X[Y == 7.].shape)

        # collect the outliers in a per class manner
        classList = [1.0, 0.0, 7.0]
        y_train_pred_total = []
        for clas in classList:
            clf.fit(X[Y == clas])
            y_train_pred_total.append(clf.labels_)

        # -------------------------RESULT---------------------
        # 0:inlier, 1: outlier
        np.array(y_train_pred_total).tofile('outliers.csv',
                                            sep=',',
                                            format='%10.5f')
Example #13
 def distanceBased(self):
     '''
     @brief Function that implements the distance based component
     @param self
     @return It returns the vector with the scores of the instances
     '''
     # Initialize the scores
     scores = np.array([0] * len(self.dataset)).astype(float)
     for i in range(self.num_iter):
         knn = KNN(n_neighbors=5, contamination=self.contamination)
         # Number in the interval [50, 1000]
         subsample_size = np.random.randint(50, 1001)
         sample = []
         if subsample_size >= len(self.dataset):
             sample = list(range(len(self.dataset)))
         else:
             # Take the sample and train the model
             sample = np.random.choice(len(self.dataset),
                                       size=subsample_size,
                                       replace=False)
         knn.fit(self.dataset[sample])
         # Update the score to compute the mean
         scores[sample] += knn.decision_scores_
     # Return the mean
     scores = scores / self.num_iter
     scores = scale(scores)
     return scores
Example #14
File: model.py Project: esowc/DAAQS
    def pred_KNN(self, k=5, comp_with="openaq"):
        ## hyperparameters for KNN are tuned here
        # if self.bool_o_dict == True:
        self.comp_with = comp_with

        if comp_with == "openaq":
            if len(self.X_o) == 0:
                pred = []
            elif self.X_o.shape[0] > k:
                self.clf = KNN(n_neighbors=k)
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
            elif self.X_o.shape[0] > 2:
                # print(f"The value of k is changed from {k} to {self.X_o.shape[0]-1}")
                k = self.X_o.shape[0] - 1
                self.clf = KNN(n_neighbors=k)
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
            else:
                pred = []
            #A_location, B_location, C_location = self.pred_location(pred)

        elif comp_with == "cams":
            pred = []
            for each_X in self.X_c:
                # if each_X exists then it will have a shape of (10,8)
                self.clf = KNN(n_neighbors=k)
                self.clf.fit(each_X)
                pred.append(self.clf.labels_[-1])

        A_location, B_location, C_location = self.pred_location(pred)

        return A_location, B_location, C_location
Example #15
def stop_train(filename):
    """
    Stops training and saves the model as filename.sav
    also saves the threshold, mean and standard deviation
    in a json file of the same name. Also saves the pca model
    """
    pca = PCA(n_components=3)
    pca.fit(np.array(train.arr))
    with open(filename + 'pca.sav', 'wb') as savpca:
        pickle.dump(pca, savpca)
    z = find_theta_score(np.array(train.arr), pca)

    knn = KNN(n_neighbors=1)
    knn.fit(z)
    scores = knn.decision_scores_
    with open(filename + 'knn.sav', 'wb') as savknn:
        pickle.dump(knn, savknn)

    mean = scores.mean()
    stdev = scores.std()
    thres = mean + 18 * stdev
    params = {}
    params['mean'] = mean
    params['std'] = stdev
    params['threshold'] = thres
    with open(filename + '.json', 'w') as jsonf:
        json.dump(params, jsonf)

    print()
    print("Training Completed")
Example #16
def removeOutliers(df_flights_list,
                   contamination=0.001,
                   n_neighbors=1000,
                   method='mean'):
    '''Remove Outliers'''

    lf_array = []
    for flights in df_flights_list:
        lf_array.append(flights.lf.values)
    lf_array = np.array(lf_array)

    # Train kNN detector
    outlier_model = KNN(contamination=contamination,
                        n_neighbors=n_neighbors,
                        method=method)
    outlier_model.fit(lf_array)

    # Get the prediction labels
    outliers_labels = outlier_model.labels_  # binary labels (0: inliers, 1: outliers)

    df_flights_list = [
        df_flight for index, df_flight in enumerate(df_flights_list)
        if outliers_labels[index] == 0
    ]

    return df_flights_list
Example #17
    def fit(self,df):
        logging.info("Initializaing Pipeline")
        isTraining = True
        self.isTraining = isTraining
        self.adf = df
        self.df = df.copy()

        self.numeric_cols = getNumericColumns(df)
        self.cat_cols = getCategorialColumns(df)
        self.DEPENDENT_VARIABLE = getDependentVariable()

        self.cat_cols_useless =  [ "encounter_id" , "hospital_id" , "patient_id" , "icu_id"]
        self.cat_cols_minus = [c for c in self.cat_cols if c not in ["clusterId","hospital_death", "encounter_id" , "hospital_id" , "patient_id"]]
        self.cat_cols_minus_useless = [c for c in self.cat_cols if c not in ["clusterId", "encounter_id" , "hospital_id" , "patient_id" , "icu_id" ]]
        self.cols_to_dummy = [c for c in self.cat_cols_minus_useless if c != "hospital_death"]

        self.num_mean = SimpleImputer(strategy="median")
        self.cat_freq = SimpleImputer(strategy="most_frequent")
        self.rs = RobustScaler()
        self.pt = PowerTransformer()
        self.ohe = OneHotEncoder(handle_unknown='ignore' , sparse=False)
        self.outlierKNN = KNN()

        self.num_means = [MatrixFactorization() for i in range(4)]
        self.cat_freqs = [SimpleImputer(strategy="most_frequent") for i in range(4)]
        #self.label_encoders = defaultdict(LabelEncoder)
        self.label_encoders = WOEEncoder()
        self.later_num_transform = PowerTransformer()

        self.X = self.df.drop([self.DEPENDENT_VARIABLE] , axis=1)
        self.y = self.df[self.DEPENDENT_VARIABLE]
        return self.GetTransformedData(isTraining)
Example #18
def train_knn_anomaly_detector(input_df: pandas.DataFrame, domain: str, train_fields=(),
                               n_neighbors=10, contamination=0.1):
    """
    :param input_df: The input dataframe
    :param domain: The domain (model name)
    :param train_fields: The features (numeric only)
    :param n_neighbors: Number of neighbors to use by default for k neighbors queries.
    :param contamination: The amount of contamination of the data set, i.e. the proportion of outliers in the data set.
                          Used when fitting to define the threshold on the decision function
    :return: A list of predictions with included fields
    """

    feature_group_id = hashlib.md5(str(sorted(train_fields)).encode()).hexdigest()
    drop_fields = [field for field in input_df.columns if field not in train_fields]
    train_df = input_df.drop(drop_fields, axis=1)
    for column in train_df.columns:
        train_df[column] = train_df[column].fillna(0)
    model_directory = os.path.join(const.DYNAMITE_CONF_ROOT, 'models', 'knn_anomaly_detector', feature_group_id)
    model_pkl_file = os.path.join(model_directory, domain + '.pkl')

    makedirs(model_directory)

    model = KNN(contamination=contamination, n_neighbors=n_neighbors, metric='manhattan')

    joblib.dump(model.fit(train_df), model_pkl_file)
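The function persists the fitted detector but returns nothing; a sketch of the matching load-and-score step, assuming the same pickle path and feature list (the helper name is hypothetical):

import joblib

def score_with_knn_anomaly_detector(input_df, model_pkl_file, train_fields=()):
    # reload the fitted detector and score new rows on the same features
    model = joblib.load(model_pkl_file)
    features = input_df[list(train_fields)].fillna(0)
    return model.predict(features)  # 0: inlier, 1: outlier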
Example #19
def api_alert(influxdb_ip, influxdb_port, influxdb_user, influxdb_pwd,
              influxdb_database, influxdb_table, apiid):

    timelimit = 'time > now()-1d'
    # Connect to InfluxDB
    client = InfluxDBClient(influxdb_ip, influxdb_port, influxdb_user,
                            influxdb_pwd, influxdb_database)
    # Fetch this API's data from the past day
    result = client.query('select Average, CallCount, ErrorRate from ' +
                          influxdb_table + ' where ApiId = \'' + apiid +
                          '\' and ' + timelimit + ';')
    # Convert the ResultSet into a list
    apis_table = list(result.get_points(measurement='apis'))
    # Store the data to be processed in a DataFrame
    df = pd.DataFrame(data=apis_table)
    # Drop the columns not used in the computation to obtain the training set x
    x = df
    x = x.drop("time", axis=1)
    # Normalize the data, mapping each feature into [0, 1]
    x['CallCount'] = (x['CallCount']-x['CallCount'].min()) / \
        (x['CallCount'].max()-x['CallCount'].min())
    x['Average'] = (x['Average']-x['Average'].min()) / \
        (x['Average'].max()-x['Average'].min())
    x['ErrorRate'] = x['ErrorRate'] / 100
    # Take the last ten-second data point as the test point
    x_last = x.tail(1)
    #df_last = df.tail(1)
    x = x.drop(x.index[-1])
    df = df.drop(df.index[-1])
    # Convert to numpy arrays for computation
    x = x.values

    # Train a kNN detector
    clf_name = 'kNN'
    clf = KNN()  # initialize the detector clf
    clf.fit(x)  # fit the detector clf on x

    # Add a column to df with the anomaly scores
    df['score'] = clf.decision_scores_

    # Sort by score, descending
    df = df.sort_values("score", ascending=False)
    #print(df.head(20))

    # Score the new data point
    test_data = x_last
    test_scores = clf.decision_function(test_data)

    if (test_scores > 0.8):
        print('Anomaly level 4: an alert is mandatory')
    elif (test_scores > 0.5):
        print('Anomaly level 3: an alert is required')
    elif (test_scores > 0.1):
        print('Anomaly level 2: an alert is recommended')
    elif (test_scores > 0.05):
        print('Anomaly level 1: an alert is optional')
        # These levels were read off the plots in KNN.py: above 0.05 a point is clearly
        # anomalous, above 0.1 it is already an outlier, and above 0.5 it is far from the data.
        # The thresholds depend on the training window; for one day of data 0.05 works well.
    return test_scores
Example #20
    def __init__(self, get_top=0.8, if_params={}, knn_params={}):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False

        self.iforest = IForest(**if_params)

        self.knn = KNN(**knn_params)
Example #21
def median_knn(X_train, X_test, Y_train, Y_test):
    from pyod.models.knn import KNN
    model = KNN(method='median')
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
Example #22
 def knnAD(self):
     clf_name = 'KNN'
     clf = KNN()
     clf.fit(self.X)
     # get the prediction labels and outlier scores of the training data
     y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
     y_scores = clf.decision_scores_  # raw outlier scores
     generateAnomalis(self.data, self.label, y_pred)
Example #23
def train_monitoring_model(data):
    logger.info("Training a monitoring model")

    X_train, X_test = train_test_split(np.array(data, dtype='float'),
                                       test_size=0.2)
    monitoring_model = KNN(contamination=0.05, n_neighbors=15, p=5)
    monitoring_model.fit(X_train)
    return monitoring_model
Example #24
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Example #25
def outliers(base):
    detector = KNN()
    detector.fit(base)
    previsoes = detector.labels_
    outliers = []
    for i in range(len(previsoes)):
        if previsoes[i] == 1:
            outliers.append(i)
    base = base.drop(base.index[outliers])
    return base
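The index-collecting loop can be replaced with a boolean mask over the fitted labels; an equivalent, more idiomatic sketch:

def outliers_vectorized(base):
    detector = KNN()
    detector.fit(base)
    return base[detector.labels_ == 0]  # keep only the inliers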
Example #26
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')
Example #27
    def S2(self):

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(), ['S2']] = 0  # mark the rows flagged as anomalous in S1 as 0 in S2

        result['统计异常'] = water_data['S2'].values

        # Find the anomalous dimensions
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values

        # Save the models
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
Example #28
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # fit the detector clf on x_train

    # anomaly labels and scores on the training data x_train
    y_train_pred = clf.labels_  # labels on the training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher is more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the fitted clf to score unseen data
    y_test_pred = clf.predict(x_test)  # labels on unseen data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # outlier scores on unseen data (higher is more anomalous)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
Example #29
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #30
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #31
class TestKnnMedian(unittest.TestCase):

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def tearDown(self):
        pass
Example #32
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')
Example #33
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
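pyod ships a small plotting helper for exactly this quickstart; as an optional follow-up for the 2-D data generated above:

from pyod.utils.example import visualize

# scatter train/test points, comparing ground truth against predictions
visualize(clf_name, X_train, y_train, X_test, y_test,
          y_train_pred, y_test_pred, show_figure=True, save_figure=False)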
Example #34
    n_clf = 20  # number of base detectors

    # Initialize 20 base detectors for combination
    k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
              150, 160, 170, 180, 190, 200]

    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i in range(n_clf):
        k = k_list[i]

        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)
    # Combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # Combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)
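For context, this snippet assumes `X_train_norm`, `X_test_norm`, and `y_test` were prepared earlier; the imports it relies on (standard pyod utilities) are:

import numpy as np
from pyod.models.knn import KNN
from pyod.models.combination import average, maximization
from pyod.utils.utility import standardizer
from pyod.utils.data import evaluate_print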
Example #35
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass