def weighted_centering(X: np.ndarray, metric: str = 'cosine',
                       gamma: float = 1.,
                       test_set_mask: np.ndarray = None) -> np.ndarray:
    """
    Perform weighted centering: shift origin to the weighted data mean

    Move the origin more actively towards hub objects in the dataset,
    rather than towards the data centroid [1]_.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an
        ``m`` dimensional feature space (columns).

    metric : {'cosine', 'euclidean'}, optional (default: 'cosine')
        Distance measure used to place more weight on objects that are
        more likely to become hubs. (Defined for 'cosine' in [1]_,
        'euclidean' does not make much sense and might be removed in the
        future).

    gamma : float, optional (default: 1.0)
        Controls how much we emphasize the weighting effect

        - ``gamma=0`` : equivalent to normal centering
        - ``gamma>0`` : move origin closer to objects with larger
          similarity to other objects

    test_set_mask : ndarray, optional (default: None)
        Hold back data as a test set and compute the per-object scores
        against the remaining data (training set) only. Accepts either an
        array of row indices or a boolean mask with one entry per row.

    Returns
    -------
    X_wcent : ndarray
        Weighted centered vectors (same shape as ``X``).

    Raises
    ------
    ValueError
        If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.

    Notes
    -----
    For ``metric='euclidean'`` the score of an object is the sum of its
    Euclidean distances to all training objects.

    The weighted mean itself is taken over all rows of ``X``, including
    rows listed in ``test_set_mask``; only the scores are restricted to
    the training set.

    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K.
           (2013). Centering similarity measures to reduce hubs. In
           Proceedings of the 2013 Conference on Empirical Methods in
           Natural Language Processing (pp 613–623).
           Retrieved from
           https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
    """
    n = X.shape[0]

    # Indices of training examples
    if test_set_mask is not None:
        test_idx = np.asarray(test_set_mask)
        if test_idx.dtype == bool:
            # A boolean mask must be turned into indices first; setdiff1d
            # would otherwise read True/False as the indices 1 and 0.
            test_idx = np.flatnonzero(test_idx)
        train_idx = np.setdiff1d(np.arange(n), test_idx)
    else:
        train_idx = slice(0, n)
    X_train = X[train_idx]
    n_train = X_train.shape[0]

    d = np.zeros(n)
    if metric == 'cosine':
        # Loop-invariant: centroid of the training vectors.
        centroid = X_train.sum(0) / n_train
        for i in range(n):
            # NOTE(review): ``cos`` is defined outside this block; it is
            # presumably a pairwise cosine measure returning a 2 x 2
            # matrix here, of which the off-diagonal entry is used.
            d[i] = n_train * cos(np.array([X[i], centroid]))[0, 1]
    # Using euclidean distances does not really make sense
    elif metric == 'euclidean':
        for i in range(n):
            # Displacements from object i to every training object, then
            # one Euclidean norm per training object.
            displ_v = X_train - X[i]
            d[i] = np.linalg.norm(displ_v, axis=1).sum()
    else:
        raise ValueError("Weighted centering only supports cosine distances.")

    # Normalize the (gamma-emphasized) scores into weights summing to one.
    d_gamma = d ** gamma
    w = d_gamma / np.sum(d_gamma)
    vectors_mean_weighted = np.sum(w.reshape(n, 1) * X, 0)
    X_wcent = X - vectors_mean_weighted
    return X_wcent
def localized_centering(X: np.ndarray, metric: str = 'cosine',
                        kappa: int = 40, gamma: float = 1.,
                        test_set_mask: np.ndarray = None):
    """
    Perform localized centering.

    Reduce hubness in datasets according to the method proposed in [2]_.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an
        ``m`` dimensional feature space (columns).

    metric : {'cosine', 'euclidean'}
        Measure from which the similarities are derived.
        (Defined for 'cosine' in [2]_, 'euclidean' does not make much
        sense and might be removed in the future; cf. the global
        centering of [1]_).

    kappa : int, optional (default: 40)
        Local segment size, determines the size of the local
        neighborhood for calculating the local affinity. When
        ``kappa=n`` localized centering reduces to standard centering.
        "select κ depending on the dataset, so that the correlation
        between Nk(x) and the local affinity <x, cκ(x)> is maximized"
        [2]_

    gamma : float, optional (default: 1.0)
        Control the degree of penalty, so that the similarity score used
        is smaller depending on how likely a point is to become a hub.
        "Parameter γ can be tuned so as to maximally reduce the skewness
        of the Nk distribution" [2]_.

    test_set_mask : ndarray, optional (default: None)
        Hold back data as a test set and perform centering on the
        remaining data (training set). Used to index a row of the
        similarity matrix, so a boolean mask with one entry per object
        or an array of indices both work.

    Returns
    -------
    S_lcent : ndarray
        Secondary similarity (localized centering) matrix.

    Raises
    ------
    ValueError
        If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.

    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K.
           (2013). Centering similarity measures to reduce hubs. In
           Proceedings of the 2013 Conference on Empirical Methods in
           Natural Language Processing (pp 613–623).
           Retrieved from
           https://www.aclweb.org/anthology/D/D13/D13-1058.pdf

    .. [2] Hara, K., Suzuki, I., Shimbo, M., Kobayashi, K., Fukumizu, K.,
           & Radovanović, M. (2015). Localized centering: Reducing
           hubness in large-sample data. In AAAI ’15: Proceedings of the
           29th AAAI Conference on Artificial Intelligence
           (pp. 2645–2651).
    """
    # Validate first so an unsupported metric always surfaces as the
    # documented ValueError, before any other work is done.
    if metric not in ('cosine', 'euclidean'):
        raise ValueError("Localized centering only supports cosine distances.")

    if test_set_mask is None:
        # Builtin ``bool`` rather than the ``np.bool`` alias, which is
        # deprecated since NumPy 1.20 and absent from NumPy 1.24-1.26.
        test_set_mask = np.zeros(X.shape[0], dtype=bool)

    if metric == 'cosine':
        # Rescale vectors to unit length
        v = X / np.sqrt((X ** 2).sum(-1))[..., np.newaxis]
        # for unit vectors it holds inner() == cosine()
        # NOTE(review): ``cos`` is defined outside this block; presumably
        # it returns the pairwise cosine distance matrix — confirm.
        sim = 1 - cos(v)
    else:  # 'euclidean' -- Localized centering meaningful for Euclidean?
        v = X  # no scaling here...
        # NOTE(review): ``l2`` is defined outside this block; presumably
        # it returns the pairwise Euclidean distance matrix — confirm.
        sim = 1 / (1 + l2(v))

    n = sim.shape[0]
    local_affinity = np.zeros(n)
    for i in range(n):
        x = v[i]
        sim_i = sim[i, :].copy()
        # set similarity of test examples to zero to exclude them from fit
        sim_i[test_set_mask] = 0
        # also exclude self
        sim_i[i] = 0
        # NOTE(review): self is already zeroed above, yet the slice still
        # skips rank 0 of the descending order, so the most similar
        # remaining object is generally left out of the neighborhood.
        # Kept unchanged to preserve existing results — confirm intent.
        nn = np.argsort(sim_i)[::-1][1 : kappa+1]
        # Local centroid: mean of the selected neighbor vectors.
        c_kappa_x = np.mean(v[nn], 0)
        if metric == 'cosine':
            # c_kappa_x has no unit length in general
            local_affinity[i] = np.inner(x, c_kappa_x)
        else:
            local_affinity[i] = 1 / (1 + np.linalg.norm(x - c_kappa_x))

    # Broadcasting subtracts the affinity of object j from column j.
    sim_lcent = sim - (local_affinity ** gamma)
    return sim_lcent