Example #1
0
def dis_sim_local(X:np.ndarray, k:int=10, test_set_mask:np.ndarray=None):
    """Calculate dissimilarity based on local 'sample-wise centrality' [1]_.

    For every object the centroid of its ``k`` nearest neighbors is
    computed. The squared distances of both objects of a pair to their
    respective local centroids are then subtracted from the pair's squared
    Euclidean distance.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an
        ``m`` dimensional feature space.

    k : int, optional (default: 10)
        Neighborhood size used for determining the local centroids.
        Can be optimized as to maximally reduce hubness [1]_.

    test_set_mask : ndarray, optional (default: None)
        Hold back data as a test set and perform centering on the remaining
        data (training set). Given as an array of row indices of the test
        objects; a boolean array is interpreted as a mask and converted
        to indices.

    Returns
    -------
    D_dsl : ndarray
        Secondary distance (DisSimLocal) matrix (symmetric, ``n x n``).
        The diagonal holds ``-2 * ||x - c_k(x)||^2`` for each object.

    References
    ----------
    .. [1] Hara, K., Suzuki, I., Kobayashi, K., Fukumizu, K., &
           Radovanović, M. (2016). Flattening the density gradient for
           eliminating spatial centrality to reduce hubness. Proceedings of
           the Thirtieth AAAI Conference on Artificial Intelligence (AAAI ’16),
           1659–1665. Retrieved from http://www.aaai.org/ocs/index.php/AAAI/
           AAAI16/paper/download/12055/11787
    """
    n = X.shape[0]
    # NOTE(review): ``l2`` is defined outside this block; it is assumed to
    # return a freshly allocated ``n x n`` pairwise distance matrix.
    D = l2(X)
    # Exclude self distances from kNN lists:
    np.fill_diagonal(D, np.inf)

    # Centroids are means, so they must be stored as floats. With an integer
    # ``X`` a ``zeros_like(X)`` buffer would silently truncate them (and
    # unsigned differences below would wrap around).
    if np.issubdtype(X.dtype, np.inexact):
        X_f = X
    else:
        X_f = X.astype(np.float64)
    c_k = np.zeros_like(X_f)

    if test_set_mask is not None:
        test_idx = np.asarray(test_set_mask)
        if test_idx.dtype == bool:
            # A boolean mask would otherwise be read as the indices {0, 1}.
            test_idx = np.flatnonzero(test_idx)
        train_idx = np.setdiff1d(np.arange(n), test_idx)
    else: # take all
        train_idx = np.arange(n)

    # Local centroid of each object: mean of its k nearest training objects.
    for i in range(n):
        knn_idx = np.argsort(D[i, train_idx])[0:k]
        c_k[i] = X_f[train_idx[knn_idx]].mean(0)

    # Squared distance of every object to its own local centroid.
    c_k_xy = ((X_f - c_k) ** 2).sum(1)

    # Fill the upper triangle row by row, then mirror it, so the result is
    # exactly symmetric.
    dis_sim = np.zeros_like(D)
    for x in range(n):
        x_y = ((X_f[x] - X_f[x:]) ** 2).sum(1)
        dis_sim[x, x:] = x_y - c_k_xy[x] - c_k_xy[x:]
    return dis_sim + dis_sim.T - np.diag(np.diag(dis_sim))
Example #2
0
def localized_centering(X:np.ndarray, metric:str='cosine', kappa:int=40, 
                        gamma:float=1., test_set_mask:np.ndarray=None):
    """
    Perform localized centering.

    Reduce hubness in datasets according to the method proposed in [2]_.

    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an
        ``m`` dimensional feature space.

    metric : {'cosine', 'euclidean'}
        Distance measure used to place more weight on objects that are more
        likely to become hubs. (Defined for 'cosine' in [1]_, 'euclidean' does
        not make much sense and might be removed in the future).

    kappa : int, optional (default: 40)
        Local segment size, determines the size of the local neighborhood for
        calculating the local affinity.
        "select κ depending on the dataset, so that the correlation between
        Nk(x) and the local affinity <x, cκ(x)> is maximized" [2]_

    gamma : float, optional (default: 1.0)
        Control the degree of penalty, so that the similarity score used
        is smaller depending on how likely a point is to become a hub.
        "Parameter γ can be tuned so as to maximally reduce the skewness
        of the Nk distribution" [2]_.

    test_set_mask : ndarray, optional (default: None)
        Hold back data as a test set and perform centering on the remaining
        data (training set). Either a boolean mask of length ``n`` or an
        array of row indices of the test objects.

    Returns
    -------
    S_lcent : ndarray
        Secondary similarity (localized centering) matrix. Entry ``[i, j]``
        is ``sim[i, j] - local_affinity[j] ** gamma``.

    Raises
    ------
    ValueError
        If `metric` is neither 'cosine' nor 'euclidean'.

    Notes
    -----
    The local centroid of an object is the mean of its `kappa` most similar
    training objects, never including the object itself. If fewer than
    `kappa` such objects exist, all of them are used.

    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013).
           Centering similarity measures to reduce hubs. In Proceedings of the
           2013 Conference on Empirical Methods in Natural Language Processing
           (pp 613–623).
           Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf

    .. [2] Hara, K., Suzuki, I., Shimbo, M., Kobayashi, K., Fukumizu, K., &
           Radovanović, M. (2015). Localized centering: Reducing hubness in
           large-sample data. In AAAI ’15: Proceedings of the 29th AAAI
           Conference on Artificial Intelligence (pp. 2645–2651).
    """
    # Boolean flags of objects held back from the fit. The builtin ``bool``
    # is used because the ``np.bool`` alias was removed in NumPy 1.24.
    is_test = np.zeros(X.shape[0], dtype=bool)
    if test_set_mask is not None:
        is_test[test_set_mask] = True
    train_idx = np.flatnonzero(~is_test)

    # NOTE(review): ``cos`` and ``l2`` are defined outside this block; they
    # are assumed to return ``n x n`` pairwise distance matrices.
    if metric == 'cosine':
        # Rescale vectors to unit length
        v = X / np.sqrt((X ** 2).sum(-1))[..., np.newaxis]
        # for unit vectors it holds inner() == cosine()
        sim = 1 - cos(v)
    # Localized centering meaningful for Euclidean?
    elif metric == 'euclidean':
        v = X # no scaling here...
        sim = 1 / (1 + l2(v))
    else:
        raise ValueError("Localized centering only supports cosine distances.")

    n = sim.shape[0]
    local_affinity = np.zeros(n)
    for i in range(n):
        x = v[i]
        # Neighbor candidates: all training objects except the object itself.
        # Selecting from explicit indices (instead of overwriting the
        # similarities with 0) keeps excluded objects out even when
        # similarities are negative, and avoids skipping the true nearest
        # neighbor.
        candidates = train_idx[train_idx != i]
        ranking = np.argsort(sim[i, candidates])[::-1][:kappa]
        nn = candidates[ranking]
        c_kappa_x = np.mean(v[nn], 0)
        if metric == 'cosine':
            # c_kappa_x has no unit length in general
            local_affinity[i] = np.inner(x, c_kappa_x)
        else: # 'euclidean', validated above
            local_affinity[i] = 1 / (1 + np.linalg.norm(x - c_kappa_x))
    # Broadcasting subtracts the penalty of object j from column j.
    sim_lcent = sim - (local_affinity ** gamma)
    return sim_lcent