import numpy as np
from scipy.spatial.distance import pdist, squareform


# NOTE: the surrounding module is assumed to provide its own Euclidean and
# cosine distance helpers. The two functions below are minimal SciPy-based
# stand-ins so that this section is self-contained and runnable.
def l2(X):
    """Pairwise Euclidean distance matrix of the rows of ``X``."""
    return squareform(pdist(X, 'euclidean'))


def cos(X):
    """Pairwise cosine distance matrix of the rows of ``X``."""
    return squareform(pdist(X, 'cosine'))


def dis_sim_local(X: np.ndarray, k: int = 10, test_set_mask: np.ndarray = None):
    """Calculate dissimilarity based on local 'sample-wise centrality' [1]_.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects in an
        ``m`` dimensional feature space.

    k : int, optional (default: 10)
        Neighborhood size used for determining the local centroids.
        Can be optimized so as to maximally reduce hubness [1]_.

    test_set_mask : ndarray, optional (default: None)
        Indices of test set examples. These are held back, and centering is
        performed on the remaining data (training set) only.

    Returns
    -------
    D_dsl : ndarray
        Secondary distance (DisSimLocal) matrix.

    References
    ----------
    .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., &
           Radovanović, M. (2016). Flattening the density gradient for
           eliminating spatial centrality to reduce hubness. In Proceedings
           of the Thirtieth AAAI Conference on Artificial Intelligence
           (AAAI ’16), 1659–1665. Retrieved from
           http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/download/12055/11787
    """
    n = X.shape[0]
    D = l2(X)
    # Exclude self distances from kNN lists:
    np.fill_diagonal(D, np.inf)
    c_k = np.zeros_like(X)
    if test_set_mask is not None:
        train_set_mask = np.setdiff1d(np.arange(n), test_set_mask)
        for i in range(n):
            # Local centroid of the k nearest *training* neighbors of x_i
            knn_idx = np.argsort(D[i, train_set_mask])[0:k]
            c_k[i] = X[train_set_mask[knn_idx]].mean(0)
    else:  # take all
        for i in range(n):
            knn_idx = np.argsort(D[i, :])[0:k]
            c_k[i] = X[knn_idx].mean(0)
    # Squared distance of each point to its local centroid
    c_k_xy = ((X - c_k) ** 2).sum(1)
    disSim = np.zeros_like(D)
    for x in range(n):
        for y in range(x, n):
            # DisSimLocal: d(x, y)^2 - d(x, c_k(x))^2 - d(y, c_k(y))^2
            x_y = ((X[x] - X[y]) ** 2).sum()
            disSim[x, y] = x_y - c_k_xy[x] - c_k_xy[y]
    # Mirror the upper triangle to obtain the full symmetric matrix
    return disSim + disSim.T - np.diag(np.diag(disSim))
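
# Illustrative usage sketch (not part of the original module): applies
# ``dis_sim_local`` to a small random dataset and checks two expected
# properties of the result. The function name ``_demo_dis_sim_local`` and
# the toy data are assumptions for demonstration purposes only.
def _demo_dis_sim_local():
    rng = np.random.RandomState(0)
    X = rng.rand(50, 5)                      # 50 objects in a 5-d feature space
    D_dsl = dis_sim_local(X, k=10)           # secondary (DisSimLocal) distances
    assert D_dsl.shape == (50, 50)           # square matrix over all objects
    assert np.allclose(D_dsl, D_dsl.T)       # symmetric by construction
    return D_dsl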
def localized_centering(X: np.ndarray, metric: str = 'cosine', kappa: int = 40,
                        gamma: float = 1., test_set_mask: np.ndarray = None):
    """Perform localized centering.

    Reduce hubness in datasets according to the method proposed in [2]_.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects in an
        ``m`` dimensional feature space.

    metric : {'cosine', 'euclidean'}
        Distance measure used to place more weight on objects that are more
        likely to become hubs. (Defined for 'cosine' in [2]_; 'euclidean'
        does not make much sense and might be removed in the future.)

    kappa : int, optional (default: 40)
        Local segment size; determines the size of the local neighborhood
        used for calculating the local affinity. When ``kappa=n``, localized
        centering reduces to standard centering.
        "select κ depending on the dataset, so that the correlation between
        Nk(x) and the local affinity <x, cκ(x)> is maximized" [2]_

    gamma : float, optional (default: 1.0)
        Controls the degree of penalty: the more likely a point is to become
        a hub, the more its similarity scores are reduced.
        "Parameter γ can be tuned so as to maximally reduce the skewness
        of the Nk distribution" [2]_

    test_set_mask : ndarray, optional (default: None)
        Indices of test set examples. These are held back, and centering is
        performed on the remaining data (training set) only.

    Returns
    -------
    S_lcent : ndarray
        Secondary similarity (localized centering) matrix.

    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K.
           (2013). Centering similarity measures to reduce hubs. In
           Proceedings of the 2013 Conference on Empirical Methods in
           Natural Language Processing (pp. 613–623). Retrieved from
           https://www.aclweb.org/anthology/D/D13/D13-1058.pdf

    .. [2] Hara, K., Suzuki, I., Shimbo, M., Kobayashi, K., Fukumizu, K., &
           Radovanović, M. (2015). Localized centering: Reducing hubness in
           large-sample data. In AAAI ’15: Proceedings of the 29th AAAI
           Conference on Artificial Intelligence (pp. 2645–2651).
    """
    if test_set_mask is None:
        test_set_mask = np.zeros(X.shape[0], bool)
    if metric == 'cosine':
        # Rescale vectors to unit length
        v = X / np.sqrt((X ** 2).sum(-1))[..., np.newaxis]
        # For unit vectors inner() == cosine similarity,
        # and 1 - cosine distance == cosine similarity
        sim = 1 - cos(v)
    elif metric == 'euclidean':  # Localized centering meaningful for Euclidean?
        v = X  # no scaling here...
        sim = 1 / (1 + l2(v))
    else:
        raise ValueError("Localized centering only supports "
                         "'cosine' and 'euclidean' metrics.")
    n = sim.shape[0]
    local_affinity = np.zeros(n)
    for i in range(n):
        x = v[i]
        sim_i = sim[i, :].copy()
        # Set similarity of test examples to zero to exclude them from the fit
        sim_i[test_set_mask] = 0
        # Also exclude self
        sim_i[i] = 0
        # The kappa most similar neighbors (self is already excluded above)
        nn = np.argsort(sim_i)[::-1][0:kappa]
        # Local centroid of the kappa nearest neighbors
        c_kappa_x = np.mean(v[nn], 0)
        if metric == 'cosine':
            # c_kappa_x does not have unit length in general
            local_affinity[i] = np.inner(x, c_kappa_x)
        else:  # metric == 'euclidean' (any other value was rejected above)
            local_affinity[i] = 1 / (1 + np.linalg.norm(x - c_kappa_x))
    # Subtract the (penalized) local affinity from the similarity matrix
    sim_lcent = sim - (local_affinity ** gamma)
    return sim_lcent
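
# Illustrative usage sketch (not part of the original module): applies
# ``localized_centering`` to toy data and verifies the shape of the
# resulting secondary similarity matrix. The function name
# ``_demo_localized_centering`` and the toy data are assumptions for
# demonstration purposes only; call the function to run the sketch.
def _demo_localized_centering():
    rng = np.random.RandomState(0)
    X = rng.rand(50, 5)                       # 50 objects in a 5-d feature space
    S_lcent = localized_centering(X, metric='cosine', kappa=20, gamma=1.)
    assert S_lcent.shape == (50, 50)          # secondary similarity matrix
    return S_lcent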