def trian_out_of_date(format_dir, model_dir):
    """
    Detect the typical "out of date" gap (reference date vs. publication
    date) by clustering out anomalous gaps with Local Outlier Factor,
    then persist the largest non-outlying gap to a model file.

    References:
    http://scikit-learn.org/stable/auto_examples/neighbors/plot_lof.html#sphx-glr-auto-examples-neighbors-plot-lof-py
    https://zhuanlan.zhih u.com/p/37753692 if split; see original links below.
    https://zhuanlan.zhihu.com/p/37753692
    """
    print("[START] train out of date")
    gaps = []
    for paper in os.listdir(format_dir):
        with open(format_dir + paper, 'r', encoding="utf-8") as fh:
            record = json.loads(fh.read())
            gaps.extend(record["out_of_date"])
    # Lift the 1-D gap values into 2-D points, as sklearn expects.
    samples = list(zip(gaps, np.zeros_like(gaps)))
    detector = LocalOutlierFactor(n_neighbors=20)
    # Unsupervised fit on the training samples themselves.
    detector.fit(samples)
    # Distance to the farthest of each point's k nearest neighbours.
    k_dist = detector.kneighbors(samples)[0].max(axis=1)
    # Largest gap among points whose k-distance is exactly zero
    # (i.e. the last non-outlying value in the original's terms).
    out_of_date = 0
    for point, dist in zip(samples, k_dist):
        if dist == 0 and point[0] > out_of_date:
            out_of_date = point[0]

    with open(model_dir + 'outlier.txt', 'w') as sink:
        sink.write(str(out_of_date))
    print("[DONE] train out of date")
def LOF(data, predict, k):
    """
    Score *predict* with a Local Outlier Factor model fitted on *data*.

    Adds two columns to the *predict* DataFrame in place and returns it:
      - 'k distances': distance to the farthest of the k+1 nearest
        training neighbours of each sample;
      - 'local outlier factor': negated LOF decision value, so larger
        values flag stronger outliers.
    """
    # novelty=True exposes the public decision_function for scoring new
    # samples; the previously used private _decision_function has been
    # removed from scikit-learn.
    clf = LocalOutlierFactor(n_neighbors=k + 1, algorithm='auto',
                             contamination=0.1, novelty=True, n_jobs=-1)
    clf.fit(data)
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)
    # iloc[:, :-1] drops the 'k distances' column just added; negate so
    # that larger values mean "more anomalous".
    predict['local outlier factor'] = -clf.decision_function(predict.iloc[:, :-1])

    return predict
# Exemple #3
# 0
def localoutlierfactor(data, predict, k):
    """
    Score *predict* with a LOF model fitted on *data*.

    Adds 'k_distances' (k+1-neighbour distance) and
    'local_outlier_factor' (negated LOF decision value; larger = more
    anomalous) columns to *predict* in place and returns it.
    """
    # novelty=True exposes the public decision_function for new samples,
    # replacing the removed private _decision_function.
    lof_clf = LocalOutlierFactor(n_neighbors=k + 1,
                                 contamination=0.2,
                                 novelty=True,
                                 n_jobs=-1)
    lof_clf.fit(data)
    # 记录 k 邻域距离 -> record the k-neighbourhood distance
    predict['k_distances'] = lof_clf.kneighbors(predict)[0].max(axis=1)
    # Negated LOF score; iloc[:, :-1] drops the column just added.
    predict['local_outlier_factor'] = -lof_clf.decision_function(
        predict.iloc[:, :-1])
    return predict
def LOF_anomaly_score(x):
    """To figure out anomaly scores.

    For every (data, label) pair in *x*, fit a LOF model, split the
    samples into outliers/inliers at a fixed score threshold, plot both
    groups, save the figure as ``LOF_images/LOF_<label>``, and collect
    the outlier indices.

    Returns
    -------
    list of lists: indices of detected outliers, one list per pair.
    """
    outliers_list = []
    # Loop-invariant parameters hoisted out of the loop.
    method = 1  # LOF score threshold separating outliers from inliers
    k = 30      # neighbourhood size
    for i, j in x:
        pd_i = pd.DataFrame(i)
        # novelty=True exposes the public decision_function; the private
        # _decision_function was removed from scikit-learn.
        clf = LocalOutlierFactor(n_neighbors=k,
                                 algorithm='auto',
                                 contamination=0.1,
                                 novelty=True,
                                 n_jobs=-1)
        clf.fit(pd_i)
        # Record k neighborhood distance
        pd_i['k distances'] = clf.kneighbors(pd_i)[0].max(axis=1)
        # Record LOF factor, take negative so larger = more anomalous;
        # iloc[:, :-1] drops the 'k distances' column just added.
        pd_i['local outlier factor'] = -clf.decision_function(
            pd_i.iloc[:, :-1])
        # Separate outlier points and normal points by the threshold.
        outliers = pd_i[pd_i['local outlier factor'] > method].sort_values(
            by='local outlier factor')
        inliers = pd_i[pd_i['local outlier factor'] <= method].sort_values(
            by='local outlier factor')
        # Figure — reuse the frames above instead of recomputing the
        # boolean masks for every scatter call.
        plt.rcParams['axes.unicode_minus'] = False  # display the negative sign
        plt.figure(figsize=(8, 4)).add_subplot(111)
        plt.scatter(outliers.index,
                    outliers['local outlier factor'],
                    c='red',
                    s=50,
                    marker='.',
                    alpha=None,
                    label='outliers')
        plt.scatter(inliers.index,
                    inliers['local outlier factor'],
                    c='black',
                    s=50,
                    marker='.',
                    alpha=None,
                    label='inliers')
        plt.hlines(method, -2, 2 + max(pd_i.index), linestyles='--')
        plt.xlim(-2, 2 + max(pd_i.index))
        plt.title(f'LOF Local outlier detection of {j}', fontsize=13)
        plt.ylabel('Anamoly Score', fontsize=15)  # Local outlier Factors
        plt.legend()
        plt.savefig(f'LOF_images/LOF_{j}', format='png', dpi=1200)
        plt.show()
        outliers_list.append(list(outliers.index))
    return outliers_list
# Exemple #5
# 0
def localoutlierfactor(data, predict, k):
    """
    Score *predict* with a LOF model fitted on *data*.

    Adds 'k distances' and 'local outlier factor' (negated decision
    value; larger = more anomalous) columns to *predict* in place and
    returns it.
    """
    from sklearn.neighbors import LocalOutlierFactor
    # novelty=True exposes the public decision_function for new samples,
    # replacing the removed private _decision_function.
    clf = LocalOutlierFactor(n_neighbors=k + 1,
                             algorithm='auto',
                             contamination=0.1,
                             novelty=True,
                             n_jobs=-1)
    clf.fit(data)
    # 记录 k 邻域距离 -> record the k-neighbourhood distance
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)
    # Negated LOF score; iloc[:, :-1] drops the column just added.
    predict['local outlier factor'] = -clf.decision_function(
        predict.iloc[:, :-1])
    return predict
# Exemple #6
# 0
def LOF(datastream, kpar):
    """Fit a Local Outlier Factor model on *datastream* and collect
    per-sample statistics into a Point record.

    Returns a Point carrying, for every training sample: the positive
    LOF score, the local reachability density, the k-nearest-neighbour
    distances and the neighbour indices.
    """
    #not sure to use euclidean or minkowski
    # NOTE(review): Point is a project-declared record type; verify it
    # accepts the LOF/lrd/kdist/knn attributes assigned below.
    Points = Point()
    clf = LocalOutlierFactor(n_neighbors=kpar,
                             algorithm="kd_tree",
                             leaf_size=30,
                             metric='euclidean')
    clf.fit(datastream)
    # negative_outlier_factor_ stores the negated LOF; flip the sign back.
    Points.LOF = [-x for x in clf.negative_outlier_factor_.tolist()]
    # HACK: _lrd (local reachability density) is a private scikit-learn
    # attribute with no public equivalent — may break on upgrade; confirm.
    Points.lrd = clf._lrd.tolist()
    # kneighbors() without arguments returns neighbours of the training set.
    dist, ind = clf.kneighbors()
    Points.kdist = dist.tolist()
    Points.knn = ind.tolist()
    return Points
def localOutlierFactor(data, predict, k):
    """
    Score *predict* with a LOF model fitted on *data*.

    Adds 'k distances' and 'local outlier factor' (negated decision
    value; larger = more anomalous) columns to *predict* in place and
    returns it.
    """
    # LOF — novelty=True exposes the public decision_function for new
    # samples; the private _decision_function was removed from sklearn.
    clf = LocalOutlierFactor(n_neighbors=k + 1,
                             algorithm='auto',
                             contamination=0.1,
                             novelty=True,
                             n_jobs=-1)
    clf.fit(data)

    # Compute k-nearest-point distance (max over the k+1 neighbours).
    predict['k distances'] = clf.kneighbors(predict)[0].max(axis=1)

    # Record LOF, negated; iloc[:, :-1] drops the column just added.
    predict['local outlier factor'] = -clf.decision_function(
        predict.iloc[:, :-1])
    return predict
def calc_max_lof_of_bounds(document_topics, metric, k_min, k_max):
    """
    Max LOF score per document over neighbourhood sizes k in [k_min, k_max].

    Fits a single k_max-neighbour model and reuses prefixes of its
    neighbour lists for every smaller k, instead of refitting per k.

    Returns
    -------
    numpy array of shape (num_docs,) with the element-wise maximum score.
    """
    num_docs = document_topics.shape[0]
    logger.info('calculating max outlier scores: k_min={}, k_max={}'.format(k_min, k_max))
    model = LocalOutlierFactor(n_neighbors=k_max, metric=metric, n_jobs=3)
    logger.info('creating k_max-model {}'.format(model))
    model.fit(document_topics)
    # X=None queries the neighbours of the training samples themselves.
    neighbors_distances_k_max, neighbors_indices_k_max = model.kneighbors(None, n_neighbors=k_max)
    # Single initialization (the original initialized this array twice).
    max_outlier_scores = np.zeros(num_docs)
    for k in range(k_min, k_max + 1):
        logger.debug('calculating {}-scores'.format(k))
        # The first k columns are exactly the k nearest neighbours.
        neighbors_distances = neighbors_distances_k_max[:, 0:k]
        neighbors_indices = neighbors_indices_k_max[:, 0:k]
        outlier_scores = local_outlier_factor(neighbors_distances, neighbors_indices, k)
        max_outlier_scores = np.maximum(max_outlier_scores, outlier_scores)
    return max_outlier_scores
        # NOTE(review): torn fragment — the enclosing function's signature and
        # the "if" branch this line belongs to lie outside the visible chunk.
        cluster_stds = numpy.random.uniform(low=0, high=10, size=[nr_clusters, nr_features]).astype(numpy.float64)

    dataset = make_dataset(nr_clusters, cluster_stds, nr_samples, nr_features)
    assert dataset.shape == (nr_samples, nr_features), "dimension mismatch"

    # Benchmark: average LOF fit_predict wall time over 5 runs.
    # NOTE(review): time.clock() was removed in Python 3.8 — this code only
    # runs on older interpreters; time.perf_counter() is the replacement.
    clf = LocalOutlierFactor(n_neighbors=nr_neighbors, algorithm="brute", metric="euclidean")
    start_time = time.clock()
    for i in range(0, 5):
        clf.fit_predict(dataset)
    exec_time = time.clock() - start_time
    print("--- %s seconds ---" % (exec_time / 5))

    # Append the averaged timing as a CSV row.
    with open("Execution_time.txt", "a") as f:
        f.write("{},{},{},{}\n".format(nr_samples, nr_features, nr_neighbors, exec_time / 5))

    # Neighbour distances/indices of the training points themselves.
    neigh_dist, neigh_ind = clf.kneighbors()

    assert os.getcwd().split("/")[-1] == "team025", "wrighting to the wrong directory, run from team025 root"
    dir_name = "./data/n{}_k{}_dim{}".format(int(nr_samples), int(nr_neighbors), int(nr_features))
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    print("Writing to directory: {}".format(dir_name))
    with open("{}/metadata.txt".format(dir_name), "w+") as f:
        f.write("Nr points, dim, nr neigh\n")
        f.write("{},{},{}\n".format(nr_samples, nr_features, nr_neighbors))
        f.write("Min per dimension\n")
        s = ', '.join(str(dim) for dim in numpy.min(dataset, axis=0))
        f.write(s)
        f.write("\nMax per dimension\n")
        # NOTE(review): chunk truncated here — the write of this max line
        # appears to continue past the visible range.
        s = ', '.join(str(dim) for dim in numpy.max(dataset, axis=0))
class ApplicabilityDomain():
    """
    Applicability Domain (AD) estimator.

    Wraps one of three one-class methods behind a common fit/predict
    interface: 'knn' (inverse mean neighbour distance), 'lof' (Local
    Outlier Factor with novelty=True) or 'ocsvm' (One-Class SVM with an
    RBF kernel).  After ``fit``, ``predict`` returns shifted AD values
    where values lower than 0 mean a sample is outside the domain.
    """

    def __init__(self,
                 method_name='ocsvm',
                 rate_of_outliers=0.01,
                 gamma='auto',
                 nu=0.5,
                 n_neighbors=10,
                 metric='minkowski',
                 p=2):
        """
        Applicability Domain (AD)

        Parameters
        ----------
        method_name: str, default 'ocsvm'
            The name of method to set AD. 'knn', 'lof', or 'ocsvm'
        rate_of_outliers: float, default 0.01
            Rate of outlier samples. This is used to set threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. The default 'auto' optimizes gamma
            to maximize the variance of the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower
            bound of the fraction of support vectors. Should be in the
            interval (0, 1]. By default 0.5 will be taken.
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors: (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : string or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from
            scikit-learn or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : integer, default 2
            Parameter for the Minkowski metric; p=1 is manhattan_distance
            (l1), p=2 euclidean_distance (l2), otherwise minkowski (l_p).
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """

        # Fail fast on an unsupported method name.  sys.exit raises
        # SystemExit, which existing callers may already rely on catching.
        if method_name not in ('knn', 'lof', 'ocsvm'):
            sys.exit(
                'There is no ad method named \'{0}\'. Please check the variable of method_name.'
                .format(method_name))

        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        # NOTE(review): metric and p are stored but not passed to the
        # estimators constructed in fit() — confirm whether intentional.
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)

        Set AD (fit the chosen one-class model and calibrate the offset so
        that ``rate_of_outliers`` of the training samples score below 0).

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data,
            m is the number of training samples and
            n is the number of X-variables
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                # Heuristic gamma search: pick the candidate maximizing the
                # variance of the RBF Gram matrix over the training set.
                ocsvm_gammas = 2**np.arange(-20, 11, dtype=float)
                variance_of_gram_matrix = []
                for ocsvm_gamma in ocsvm_gammas:  # (unused enumerate index removed)
                    gram_matrix = np.exp(-ocsvm_gamma *
                                         cdist(x, x, metric='seuclidean'))
                    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
                self.optimal_gamma = ocsvm_gammas[
                    variance_of_gram_matrix.index(
                        max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf',
                                  gamma=self.optimal_gamma,
                                  nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        elif self.method_name == 'knn':
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors)
            self.ad.fit(x)
            # kneighbors(None) queries neighbours of the training points.
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            # Inverse mean neighbour distance: larger = deeper inside AD.
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            self.ad = LocalOutlierFactor(novelty=True,
                                         contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_

        # Percentile threshold so that rate_of_outliers of the training
        # samples fall below 0 after the shift applied in predict().
        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)

    def predict(self, x):
        """
        Applicability Domain (AD)

        Predict AD-values

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, which is autoscaled with training data,
            and k is the number of test samples

        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 means outside of AD
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            # decision_function is available because fit used novelty=True.
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        # Shift by the training-time offset: < 0 means outside the AD.
        return ad_values - self.offset