def local_outlier_detection(training_vectors, test_vectors_clean, test_vectors_anomalous):
    """Fit a novelty-mode Local Outlier Factor model and label three data sets.

    The model is fitted on ``training_vectors`` and then used to label the two
    unseen test sets. Labels follow the scikit-learn convention: ``1`` for an
    inlier and ``-1`` for an outlier.

    :param training_vectors: samples the model is fitted on.
    :param test_vectors_clean: unseen samples passed to ``predict``.
    :param test_vectors_anomalous: unseen samples passed to ``predict``.
    :return: tuple ``(result_clean, result_anomalous, result_training)`` of
        label arrays, in that order.
    """
    # Local import so this function does not depend on a file-level numpy import.
    import numpy as np

    print("Starting Local Outlier Fitting...")

    # Fitting model for novel predictions
    lof = LocalOutlierFactor(
        novelty=True,
        contamination='auto',
        algorithm='auto',
        n_neighbors=20,
        n_jobs=-1,
    )
    print("Fitting with Parameters: ", lof.get_params())
    lof.fit(training_vectors)

    # With novelty=True, predict() must only be given unseen data: called on
    # the training set, every sample is found as its own nearest neighbour and
    # the labels are biased towards "inlier". The training labels are therefore
    # derived from the factors computed during fit(), using the same threshold
    # rule that fit_predict() applies.
    result_training = np.where(lof.negative_outlier_factor_ < lof.offset_, -1, 1)
    print("Fitting successful!")

    print("Starting Prediction...")
    # Predict returns 1 for inlier and -1 for outlier
    result_clean = lof.predict(test_vectors_clean)
    result_anomalous = lof.predict(test_vectors_anomalous)
    print("Predicting successful!")
    print("**************************")

    return result_clean, result_anomalous, result_training
class Outlier(Intent):
    """Intent that flags rows of a data frame as local outliers.

    Wraps a ``LocalOutlierFactor`` classifier built from the constructor
    arguments.
    """

    def __init__(self, n_neighbors: int = 20, contamination: Union[float, str] = 'auto') -> None:
        """Create the underlying classifier with the given parameters."""
        super().__init__()
        self.clf = LocalOutlierFactor(
            n_neighbors=n_neighbors,
            contamination=contamination,
        )

    def to_string(self) -> str:
        """Return the intent name; also used as the result column label."""
        return 'Outlier'

    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a one-column frame aligned to ``df.index``: 1 = outlier, 0 = not.

        Only numeric columns are used and rows containing NaN are dropped
        before fitting; such rows end up as 0 in the result through the final
        reindex.
        """
        numeric_rows = df.select_dtypes(include=['number']).dropna()

        # Scale every numeric column with MinMaxScaler before fitting.
        scaler = preprocessing.MinMaxScaler()
        features = scaler.fit_transform(numeric_rows.values)

        labels = self.clf.fit_predict(features)

        # fit_predict output is remapped: -1 becomes 1 (flagged), 1 becomes 0.
        flags = pd.DataFrame(
            data=labels,
            index=numeric_rows.index,
            columns=[self.to_string()],
        )
        flags = flags.replace({-1: 1, 1: 0})

        # Keep only flagged rows, then expand back to the full input index,
        # filling every other row with 0.
        flagged_only = flags.loc[flags.iloc[:, 0] == 1]
        return flagged_only.reindex(index=df.index, fill_value=0)

    def info(self) -> Optional[Dict[str, Any]]:
        """Describe the intent: a type label plus the classifier parameters."""
        description = {
            "type": "Local Outlier Factory",
            "params": self.clf.get_params(),
        }
        return description
class LocalOutlierFactorFilter:
    """Wrapper around ``LocalOutlierFactor`` that fits and predicts in one step.

    Training and prediction are a single operation; there is no separate
    train/test interface. Key model parameters (set through
    :meth:`set_params`):

    * ``n_neighbors`` (int, optional, default 20): number of points taking
      part in the prediction; no obvious tuning rule.
    * ``contamination``: reflects the filtering strength; the larger the
      value, the stronger the filtering.
    """

    def __init__(self, name="局部异常因子"):
        self._model = LocalOutlierFactor()
        self.name = name

    def get_params(self, deep=True):
        """Return the parameters of the wrapped model."""
        return self._model.get_params(deep=deep)

    def _get_valid_params(self):
        """Return the list of parameter names accepted by the wrapped model."""
        return list(self.get_params())

    def set_params(self, **new_params):
        """Set parameters on the wrapped model.

        :param new_params: parameter names and values to apply.
        :raises ValueError: if any name is not a parameter of the model.

        A warning is issued when no parameters are supplied.
        """
        # Compute the valid names once instead of once per supplied key.
        valid = set(self._get_valid_params())
        if any(key not in valid for key in new_params):
            raise ValueError("传入参数含有模型中不包含的参数")
        if not new_params:
            warnings.warn("模型参数未被修改")
        self._model.set_params(**new_params)

    def fit_predict(self, x):
        """Fit the wrapped model on ``x`` and return its ``fit_predict`` output.

        :param x: data to fit and label.
        :return: whatever the wrapped model's ``fit_predict`` returns for ``x``.
        """
        return self._model.fit_predict(x)

    def _connect_SQL(self, **json_file):
        """Open the SQL Server connection described by ``json_file['dbinfo']``.

        :param json_file: request arguments; reads the ``ip``, ``port``,
            ``username``, ``password`` and ``databasename`` entries of
            ``dbinfo``.
        :return: None
        """
        db_info = json_file['dbinfo']
        self._SQL = SQLServer(host=db_info['ip'],
                              port=db_info['port'],
                              user=db_info['username'],
                              pwd=db_info['password'],
                              db=db_info['databasename'])

    def get_data_label(self, **json_file):
        """Read the label column of the data set from the database.

        :param json_file: request arguments; reads ``dbinfo.inputtable`` and
            ``label_columns``.
        :return: the object returned by the database helper, holding only the
            label column.
        :raises ValueError: if the result does not have exactly one column.
        """
        data_label = self._SQL.df_read_sqlserver(
            table=json_file['dbinfo']['inputtable'],
            cols=json_file['label_columns'])
        if data_label.shape[1] != 1:
            raise ValueError("错误:标签列数不为1")
        return data_label

    def get_data_features(self, **json_file):
        """Read the feature columns of the data set from the database.

        :param json_file: request arguments; reads ``dbinfo.inputtable`` and
            ``data_columns``.
        :return: the object returned by the database helper, holding only the
            feature columns.
        """
        return self._SQL.df_read_sqlserver(
            table=json_file['dbinfo']['inputtable'],
            cols=json_file['data_columns'])

    def _fit_and_export(self, features, columns, model_path, save_path):
        """Fit/predict on ``features``, save the model, write labels to CSV.

        Shared by the ``train_predict_from_*`` entry points.

        :param features: data passed to :meth:`fit_predict`.
        :param columns: feature column names recorded on the model object.
        :param model_path: where the fitted model is dumped.
        :param save_path: CSV path for the predicted labels.
        :return: single-column (``label``) DataFrame of predictions.
        """
        # fit_predict output is wrapped in a DataFrame so that it has the
        # column label and ``to_csv`` the callers rely on.
        pre = pd.DataFrame(self.fit_predict(features), columns=["label"])
        self._model.columns = columns
        # Temporary save; a failure here is reported by save_model itself and
        # deliberately does not abort the prediction export.
        self.save_model(model_path)
        pre.to_csv(save_path, index=False)
        return pre

    def train_predict_from_sql(self, **json_file):
        """Fit and predict on data read from SQL Server, then write results back.

        :param json_file: request arguments; reads ``dbinfo``,
            ``model_params``, ``data_columns``, ``model_path`` and
            ``save_path``.
        :return: ``{"info": <result of the database write>}`` on success, or
            a ``'failed,<error>'`` string if any step raises.
        """
        try:
            self._connect_SQL(**json_file)
            self.set_params(**json_file["model_params"])
            features = self.get_data_features(**json_file)
            pre = self._fit_and_export(features,
                                       features.columns.values.tolist(),
                                       json_file["model_path"],
                                       json_file["save_path"])
            # NOTE(review): ``cols`` is the feature column list although the
            # frame being written only holds ``label`` - confirm against the
            # SQLServer helper's contract.
            write = self._SQL.df_write_sqlserver(
                table=json_file['dbinfo']['outputtable'],
                df=pre,
                cols=json_file['data_columns'])
            return {"info": write}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_csv(self, **json_file):
        """Fit and predict on data read from a CSV file.

        :param json_file: request arguments; reads ``path``,
            ``data_columns``, ``model_params``, ``model_path`` and
            ``save_path``.
        :return: ``{"info": "success"}`` on success, or a
            ``'failed,<error>'`` string if any step raises.
        """
        try:
            features = pd.read_csv(json_file["path"],
                                   usecols=json_file['data_columns'])
            self.set_params(**json_file["model_params"])
            self._fit_and_export(features,
                                 json_file['data_columns'],
                                 json_file["model_path"],
                                 json_file["save_path"])
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_xls(self, **json_file):
        """Fit and predict on data read from an Excel file.

        :param json_file: request arguments; reads ``path``,
            ``data_columns``, ``model_params``, ``model_path`` and
            ``save_path``.
        :return: ``{"info": "success"}`` on success, or a
            ``'failed,<error>'`` string if any step raises.
        """
        try:
            features = pd.read_excel(json_file["path"],
                                     usecols=json_file['data_columns'])
            self.set_params(**json_file["model_params"])
            self._fit_and_export(features,
                                 json_file['data_columns'],
                                 json_file["model_path"],
                                 json_file["save_path"])
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def save_model(self, model_path):
        """Dump the wrapped model to ``model_path``.

        :param model_path: destination path for the model.
        :return: ``None`` on success, or a ``'failed,<error>'`` string if the
            dump raises.
        """
        try:
            joblib.dump(self._model, model_path)
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def get_model(self):
        """Return the wrapped model.

        :return: the model, or a ``'failed,<error>'`` string if the instance
            has no model attribute.
        """
        try:
            return self._model
        except AttributeError as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def load_model(self, **json_file):
        """Replace the wrapped model with the one stored at ``model_path``.

        :param json_file: request arguments; reads ``model_path``.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from trusted locations.
        self._model = joblib.load(json_file['model_path'])