def trian_out_of_date(format_dir, model_dir):
    """Learn an "out of date" threshold for citation-age gaps via LOF.

    Gathers the "out_of_date" values (difference between a paper's
    publication date and its references' dates) from every JSON file in
    *format_dir*, fits a LocalOutlierFactor model (n_neighbors=20) over
    them, and keeps the largest value whose max k-neighbor distance is
    exactly 0 (i.e. the last value not considered an outlying point).
    The resulting threshold is written to ``model_dir + 'outlier.txt'``.

    References:
    http://scikit-learn.org/stable/auto_examples/neighbors/plot_lof.html#sphx-glr-auto-examples-neighbors-plot-lof-py
    https://zhuanlan.zhihu.com/p/37753692
    """
    print("[START] train out of date")
    gaps = []
    for fname in os.listdir(format_dir):
        with open(format_dir + fname, 'r', encoding="utf-8") as fh:
            gaps += json.loads(fh.read())["out_of_date"]
    # Lift the 1-D values into 2-D points (second coordinate all zeros)
    # so they fit the sklearn estimator's expected input shape.
    samples = list(zip(gaps, np.zeros_like(gaps)))
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(samples)  # unsupervised fit
    # Max distance to each sample's k nearest neighbors.
    k_dist = lof.kneighbors(samples)[0].max(axis=1)
    # Largest gap whose k-distance is zero — the last non-outlying point.
    threshold = 0
    for idx, point in enumerate(samples):
        if k_dist[idx] == 0 and point[0] > threshold:
            threshold = point[0]
    with open(model_dir + 'outlier.txt', 'w') as out:
        out.write(str(threshold))
    print("[DONE] train out of date")
def LOF(data, predict, k):
    """Annotate *predict* with k-distance and LOF score columns.

    Fits a Local Outlier Factor model on *data* using k+1 neighbors (so
    each point effectively has k neighbors besides itself) and mutates
    *predict* in place, adding:

    - 'k distances': distance to the farthest of the k+1 nearest neighbors
    - 'local outlier factor': positive LOF score, larger = more anomalous

    Parameters
    ----------
    data : array-like or DataFrame used to fit the model.
    predict : pandas.DataFrame to score; mutated in place.
    k : int, neighborhood size.

    Returns
    -------
    pandas.DataFrame : *predict* with the two columns added.
    """
    # novelty=True exposes the public score_samples() for querying new
    # data; the private _decision_function() used previously was removed
    # from scikit-learn (score_samples is its documented replacement:
    # "opposite of the Local Outlier Factor").
    clf = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                             contamination=0.1, n_jobs=-1, novelty=True)
    clf.fit(data)
    # Max distance among each query point's k+1 nearest neighbors.
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)
    # Negate so larger values mean "more outlying"; the last column
    # ('k distances', just added) is excluded from scoring.
    predict['local outlier factor'] = -clf.score_samples(predict.iloc[:, :-1])
    return predict
def localoutlierfactor(data, predict, k):
    """Fit LOF on *data* and append distance/score columns to *predict*.

    Adds 'k_distances' (max distance over the k+1 nearest neighbors) and
    'local_outlier_factor' (negated so larger = more anomalous), then
    returns the mutated *predict* frame.
    """
    model = LocalOutlierFactor(n_neighbors=k + 1, contamination=0.2, n_jobs=-1)
    model.fit(data)
    # Distance to the farthest of the k+1 neighbors, per query point.
    neighbor_distances, _ = model.kneighbors(predict)
    predict['k_distances'] = neighbor_distances.max(axis=1)
    # Sign-flip the raw (private) score so bigger = more outlying; skip
    # the last column, which is the distance column just added.
    predict['local_outlier_factor'] = -model._decision_function(predict.iloc[:, :-1])
    return predict
def LOF_anomaly_score(x): """To figure out anomaly scores.""" # must calibrate it for all measurements outliers = [] outliers_list = [] for i, j in x: pd_i = pd.DataFrame(i) method = 1 k = 30 clf = LocalOutlierFactor(n_neighbors=k, algorithm='auto', contamination=0.1, n_jobs=-1) clf.fit(pd_i) # Record k neighborhood distance pd_i['k distances'] = clf.kneighbors(pd_i)[0].max(axis=1) # Record LOF factor,take negative pd_i['local outlier factor'] = -clf._decision_function( pd_i.iloc[:, :-1]) # Separate group points and normal points according to the threshold outliers = pd_i[pd_i['local outlier factor'] > method].sort_values( by='local outlier factor') inliers = pd_i[pd_i['local outlier factor'] <= method].sort_values( by='local outlier factor') # Figure plt.rcParams['axes.unicode_minus'] = False # display the negative sign plt.figure(figsize=(8, 4)).add_subplot(111) plt.scatter(pd_i[pd_i['local outlier factor'] > method].index, pd_i[pd_i['local outlier factor'] > method] ['local outlier factor'], c='red', s=50, marker='.', alpha=None, label='outliers') plt.scatter(pd_i[pd_i['local outlier factor'] <= method].index, pd_i[pd_i['local outlier factor'] <= method] ['local outlier factor'], c='black', s=50, marker='.', alpha=None, label='inliers') plt.hlines(method, -2, 2 + max(pd_i.index), linestyles='--') plt.xlim(-2, 2 + max(pd_i.index)) plt.title(f'LOF Local outlier detection of {j}', fontsize=13) plt.ylabel('Anamoly Score', fontsize=15) # Local outlier Factors plt.legend() plt.savefig(f'LOF_images/LOF_{j}', format='png', dpi=1200) plt.show() outliers_list.append(list(outliers.index)) return outliers_list
def localoutlierfactor(data, predict, k):
    """Score *predict* with an LOF model fitted on *data*.

    Appends a 'k distances' column (farthest of the k+1 nearest
    neighbors) and a 'local outlier factor' column (negated so larger =
    more anomalous), then returns the mutated frame.
    """
    from sklearn.neighbors import LocalOutlierFactor
    detector = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                                  contamination=0.1, n_jobs=-1)
    detector.fit(data)
    # Farthest-neighbor distance per query sample.
    distances = detector.kneighbors(predict)[0]
    predict['k distances'] = distances.max(axis=1)
    # Negated private score: bigger = more outlying. The last column —
    # the distance column just added — is excluded from scoring.
    predict['local outlier factor'] = -detector._decision_function(
        predict.iloc[:, :-1])
    return predict
def LOF(datastream, kpar):
    """Fit LOF over *datastream* and bundle the results into a Point.

    Fills the returned Point with positive LOF scores, local reachability
    densities, k-nearest-neighbor distances and neighbor indices.

    NOTE(review): the original author flagged the metric choice
    (euclidean vs minkowski) as unresolved.
    """
    result = Point()
    model = LocalOutlierFactor(n_neighbors=kpar, algorithm="kd_tree",
                               leaf_size=30, metric='euclidean')
    model.fit(datastream)
    # sklearn stores LOF scores negated; flip them back to positive.
    result.LOF = [-score for score in model.negative_outlier_factor_.tolist()]
    # Local reachability density (private sklearn attribute).
    result.lrd = model._lrd.tolist()
    distances, neighbors = model.kneighbors()
    result.kdist = distances.tolist()
    result.knn = neighbors.tolist()
    return result
def localOutlierFactor(data, predict, k):
    """Attach LOF diagnostics to *predict* using a model fitted on *data*.

    Two columns are added in place and the frame is returned:
    'k distances' (max distance among the k+1 nearest neighbors) and
    'local outlier factor' (sign-flipped so larger = stronger outlier).
    """
    # LOF
    lof = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                             contamination=0.1, n_jobs=-1)
    lof.fit(data)
    # Distance to the farthest of each point's k+1 nearest neighbors.
    knn_distances, _ = lof.kneighbors(predict)
    predict['k distances'] = knn_distances.max(axis=1)
    # Flip the sign of the (private) raw score so bigger = more anomalous;
    # the last column — the distances added above — is excluded.
    predict['local outlier factor'] = -lof._decision_function(
        predict.iloc[:, :-1])
    return predict
def calc_max_lof_of_bounds(document_topics, metric, k_min, k_max):
    """Return, per document, the max LOF score over k in [k_min, k_max].

    Fits a single LocalOutlierFactor model with k_max neighbors and
    reuses its neighbor distances/indices — truncated to each smaller k —
    to compute per-k outlier scores, taking the element-wise maximum.

    Parameters
    ----------
    document_topics : array of shape (num_docs, num_topics).
    metric : distance metric name forwarded to LocalOutlierFactor.
    k_min, k_max : inclusive range of neighborhood sizes to evaluate.

    Returns
    -------
    numpy.ndarray of shape (num_docs,) with the max score per document.
    """
    num_docs = document_topics.shape[0]
    logger.info('calculating max outlier scores: k_min={}, k_max={}'.format(k_min, k_max))
    # One model at the largest k: smaller neighborhoods are prefixes of
    # its sorted neighbor lists, so no refitting is needed per k.
    model = LocalOutlierFactor(n_neighbors=k_max, metric=metric, n_jobs=3)
    logger.info('creating k_max-model {}'.format(model))
    model.fit(document_topics)
    neighbors_distances_k_max, neighbors_indices_k_max = (
        model.kneighbors(None, n_neighbors=k_max))
    # Accumulator for the element-wise maximum across all k.
    # (The original code initialized this twice; the first assignment
    # was dead and has been removed.)
    max_outlier_scores = np.zeros(num_docs)
    for k in range(k_min, k_max + 1):
        logger.debug('calculating {}-scores'.format(k))
        neighbors_distances = neighbors_distances_k_max[:, 0:k]
        neighbors_indices = neighbors_indices_k_max[:, 0:k]
        outlier_scores = local_outlier_factor(neighbors_distances,
                                              neighbors_indices, k)
        max_outlier_scores = np.maximum(max_outlier_scores, outlier_scores)
    return max_outlier_scores
# Benchmark LOF fit_predict on a synthetic clustered dataset and record
# timings plus dataset metadata to disk.

# Random per-cluster standard deviations, one row per cluster.
cluster_stds = numpy.random.uniform(
    low=0, high=10, size=[nr_clusters, nr_features]).astype(numpy.float64)
dataset = make_dataset(nr_clusters, cluster_stds, nr_samples, nr_features)
assert dataset.shape == (nr_samples, nr_features), "dimension mismatch"

clf = LocalOutlierFactor(n_neighbors=nr_neighbors, algorithm="brute",
                         metric="euclidean")

# time.clock() was removed in Python 3.8; time.perf_counter() is its
# recommended replacement for wall-clock benchmarking.
start_time = time.perf_counter()
for i in range(0, 5):
    clf.fit_predict(dataset)
exec_time = time.perf_counter() - start_time
# Report the mean of the 5 runs.
print("--- %s seconds ---" % (exec_time / 5))

with open("Execution_time.txt", "a") as f:
    f.write("{},{},{},{}\n".format(nr_samples, nr_features, nr_neighbors,
                                   exec_time / 5))

neigh_dist, neigh_ind = clf.kneighbors()

# Guard against dumping output into an unexpected working directory.
assert os.getcwd().split("/")[-1] == "team025", \
    "writing to the wrong directory, run from team025 root"
dir_name = "./data/n{}_k{}_dim{}".format(int(nr_samples), int(nr_neighbors),
                                         int(nr_features))
if not os.path.exists(dir_name):
    os.makedirs(dir_name)
print("Writing to directory: {}".format(dir_name))

with open("{}/metadata.txt".format(dir_name), "w+") as f:
    f.write("Nr points, dim, nr neigh\n")
    f.write("{},{},{}\n".format(nr_samples, nr_features, nr_neighbors))
    f.write("Min per dimension\n")
    s = ', '.join(str(dim) for dim in numpy.min(dataset, axis=0))
    f.write(s)
    f.write("\nMax per dimension\n")
    s = ', '.join(str(dim) for dim in numpy.max(dataset, axis=0))
class ApplicabilityDomain():
    """Applicability Domain (AD) estimator with a 'knn', 'lof' or 'ocsvm' backend."""

    def __init__(self, method_name='ocsvm', rate_of_outliers=0.01, gamma='auto',
                 nu=0.5, n_neighbors=10, metric='minkowski', p=2):
        """
        Applicability Domain (AD)

        Parameters
        ----------
        method_name: str, default 'ocsvm'
            The name of method to set AD. 'knn', 'lof', or 'ocsvm'
        rate_of_outliers: float, default 0.01
            Rate of outlier samples. This is used to set threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. Current default is 'auto' which
            optimizes gamma to maximize variance in the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower
            bound of the fraction of support vectors. Should be in the
            interval (0, 1]. By default 0.5 will be taken.
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors: (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : string or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from
            scikit-learn or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : integer, default 2
            Parameter for the Minkowski metric from
            sklearn.metrics.pairwise.pairwise_distances. When p = 1, this
            is equivalent to using manhattan_distance (l1), and
            euclidean_distance (l2) for p = 2. For arbitrary p,
            minkowski_distance (l_p) is used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """
        # Fail fast on an unknown method name.
        # NOTE(review): sys.exit raises SystemExit, not ValueError, so
        # callers cannot catch this as a normal error — confirm intended.
        if method_name != 'knn' and method_name != 'lof' and method_name != 'ocsvm':
            sys.exit(
                'There is no ad method named \'{0}\'. Please check the variable of method_name.'
                .format(method_name))
        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)

        Fit the AD model and set the decision threshold (self.offset).

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data, m is the number
            of training samples and n is the number of X-variables
        """
        x = np.array(x)
        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                # Candidate gammas 2^-20 .. 2^10; pick the one maximizing
                # the variance of the RBF Gram matrix (heuristic that
                # avoids a degenerate, near-constant kernel).
                ocsvm_gammas = 2**np.arange(-20, 11, dtype=float)
                variance_of_gram_matrix = []
                # NOTE(review): 'index' is unused; cdist uses the
                # 'seuclidean' (standardized euclidean) metric, not
                # squared euclidean — confirm intended.
                for index, ocsvm_gamma in enumerate(ocsvm_gammas):
                    gram_matrix = np.exp(-ocsvm_gamma * cdist(x, x, metric='seuclidean'))
                    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
                self.optimal_gamma = ocsvm_gammas[
                    variance_of_gram_matrix.index(
                        max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf', gamma=self.optimal_gamma,
                                  nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            # NOTE(review): self.metric and self.p are not forwarded;
            # NearestNeighbors defaults are used — confirm intended.
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors)
            self.ad.fit(x)
            # kneighbors(None) queries the training set itself, excluding
            # each sample from its own neighborhood.
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            # Larger value = closer to the training data (inside AD).
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            # NOTE(review): self.n_neighbors is not passed here, so
            # sklearn's default of 20 neighbors is used — confirm intended.
            self.ad = LocalOutlierFactor(novelty=True,
                                         contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_
        # Threshold such that rate_of_outliers of the training samples
        # fall below it (i.e. are treated as outside the AD).
        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)

    def predict(self, x):
        """
        Applicability Domain (AD)

        Predict AD-values

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, which is autoscaled
            with training data, and k is the number of test samples

        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 means outside of AD
        """
        x = np.array(x)
        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        # Shift by the training threshold so that 0 marks the AD boundary.
        return ad_values - self.offset