Esempio n. 1
0
def weighted_centering(X:np.ndarray, metric:str='cosine', gamma:float=1., 
                       test_set_mask:np.ndarray=None):
    """
    Perform weighted centering: shift origin to the weighted data mean.
    
    Move the origin more actively towards hub objects in the dataset, 
    rather than towards the data centroid [1]_.
    
    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an 
        ``m`` dimensional feature space (columns).
    
    metric : {'cosine', 'euclidean'}, optional (default: 'cosine')
        Measure used to compute the per-object weights. 
        (Defined for 'cosine' in [1]_, 'euclidean' does not make much sense 
        and might be removed in the future).
        
        - ``'cosine'`` : weight is ``n_train`` times the value returned by 
          the module-level ``cos`` helper for the pair 
          (object, training centroid).
        - ``'euclidean'`` : weight is the sum of Euclidean distances from 
          the object to all training objects.
    
    gamma : float, optional (default: 1.0)
        Controls how much we emphasize the weighting effect
        
        - ``gamma=0`` : equivalent to normal centering
        - ``gamma>0`` : weights are proportional to ``d ** gamma``
    
    test_set_mask : ndarray, optional (default: None)
        Objects held back as a test set, given either as an array of row 
        indices or as a boolean mask of length ``n``. These rows are left 
        out when the per-object weights are computed. The weighted mean 
        itself is still taken over all rows of `X`.
    
    Returns
    ------- 
    X_wcent : ndarray
        Weighted centered vectors (same shape as `X`).
    
    Raises
    ------
    ValueError
        If `metric` is neither ``'cosine'`` nor ``'euclidean'``.
        
    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013). 
           Centering similarity measures to reduce hubs. In Proceedings of the 
           2013 Conference on Empirical Methods in Natural Language Processing 
           (pp 613–623). 
           Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
    """
    n = X.shape[0]

    # Indices of training examples
    if test_set_mask is not None:
        test_ind = np.asarray(test_set_mask)
        # Accept boolean masks as well as index arrays; setdiff1d alone
        # would misread booleans as the indices 0 and 1.
        if test_ind.dtype == bool:
            test_ind = np.flatnonzero(test_ind)
        train_set_mask = np.setdiff1d(np.arange(n), test_ind)
    else:
        train_set_mask = slice(0, n)

    X_train = X[train_set_mask]
    n_train = X_train.shape[0]
    d = np.zeros(n)

    if metric == 'cosine':
        # The training centroid is loop-invariant, compute it once.
        centroid = X_train.sum(0) / n_train
        for i in range(n):
            # `cos` is defined outside this block; entry [0, 1] is its
            # pairwise value between X[i] and the training centroid.
            d[i] = n_train * cos(np.array([X[i], centroid]))[0, 1]
    # Using euclidean distances does not really make sense
    elif metric == 'euclidean':
        for i in range(n):
            # Displacement from object i to every training object; the
            # row-wise L2 norm gives the Euclidean distances.
            displ_v = X_train - X[i]
            d[i] = np.linalg.norm(displ_v, axis=1).sum()
    else:
        raise ValueError("Weighted centering only supports cosine distances.")

    # Normalize the (gamma-emphasized) weights so that they sum to one.
    d_gamma = d ** gamma
    w = d_gamma / np.sum(d_gamma)
    vectors_mean_weighted = np.sum(w.reshape(n, 1) * X, 0)
    X_wcent = X - vectors_mean_weighted
    return X_wcent
Esempio n. 2
0
def localized_centering(X:np.ndarray, metric:str='cosine', kappa:int=40, 
                        gamma:float=1., test_set_mask:np.ndarray=None):
    """
    Perform localized centering.
    
    Reduce hubness in datasets according to the method proposed in [2]_.
    
    Parameters
    ----------
    X : ndarray
        An ``n x m`` vector data matrix with ``n`` objects (rows) in an 
        ``m`` dimensional feature space (columns).
        
    metric : {'cosine', 'euclidean'}
        Measure used to build the primary similarity matrix. 
        (Defined for 'cosine' in [2]_, 'euclidean' does not make much sense 
        and might be removed in the future).
        
    kappa : int, optional (default: 40)
        Local segment size, determines the size of the local neighborhood for 
        calculating the local affinity. When ``kappa=n`` localized centering 
        reduces to standard centering.
        "select κ depending on the dataset, so that the correlation between
        Nk(x) and the local affinity <x, cκ(x)> is maximized" [2]_
        
    gamma : float, optional (default: 1.0)
        Controls the degree of penalty, so that the similarity score used 
        is smaller the more likely a point is to become a hub.
        "Parameter γ can be tuned so as to maximally reduce the skewness 
        of the Nk distribution" [2]_.
        
    test_set_mask : ndarray, optional (default: None)
        Objects held back as a test set, given either as an array of row 
        indices or as a boolean mask of length ``n``. Test objects are never 
        used as neighbors when the local centroids are computed.
    
    Returns
    ------- 
    S_lcent : ndarray
        Secondary similarity (localized centering) matrix.
    
    Raises
    ------
    ValueError
        If `metric` is neither ``'cosine'`` nor ``'euclidean'``.
        
    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013). 
           Centering similarity measures to reduce hubs. In Proceedings of the 
           2013 Conference on Empirical Methods in Natural Language Processing 
           (pp 613–623). 
           Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
    
    .. [2] Hara, K., Suzuki, I., Shimbo, M., Kobayashi, K., Fukumizu, K., & 
           Radovanović, M. (2015). Localized centering: Reducing hubness in 
           large-sample data. In AAAI ’15: 
           Proceedings of the 29th AAAI Conference on Artificial Intelligence 
           (pp. 2645–2651).
    """
    # Flag the objects that must never contribute to a local centroid.
    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
    # removed in NumPy 1.24.
    excluded = np.zeros(X.shape[0], dtype=bool)
    if test_set_mask is not None:
        excluded[test_set_mask] = True

    if metric == 'cosine':
        # Rescale vectors to unit length
        v = X / np.sqrt((X ** 2).sum(-1))[..., np.newaxis]
        # `cos` is defined outside this block; presumably it returns the
        # pairwise cosine distance matrix, so 1 - cos(v) is a similarity.
        sim = 1 - cos(v)
    # Localized centering meaningful for Euclidean?
    elif metric == 'euclidean':
        v = X # no scaling here...
        # `l2` is defined outside this block; presumably pairwise
        # Euclidean distances, mapped to a similarity in (0, 1].
        sim = 1 / (1 + l2(v))
    else:
        raise ValueError("Localized centering only supports cosine distances.")

    n = sim.shape[0]
    local_affinity = np.zeros(n)
    for i in range(n):
        x = v[i]
        # Valid neighbors: every training object except the object itself.
        # Selecting among valid candidates only (instead of zeroing their
        # similarity and additionally skipping the top-ranked entry) keeps
        # the true nearest neighbor and really excludes self/test objects.
        candidate = ~excluded
        candidate[i] = False
        candidate_ind = np.flatnonzero(candidate)
        ranking = np.argsort(sim[i, :][candidate_ind])[::-1]
        nn = candidate_ind[ranking[:kappa]]
        c_kappa_x = np.mean(v[nn], 0)
        if metric == 'cosine':
            # c_kappa_x has no unit length in general
            local_affinity[i] = np.inner(x, c_kappa_x)
        else:
            # metric == 'euclidean' (validated above)
            local_affinity[i] = 1 / (1 + np.linalg.norm(x-c_kappa_x))
    # Broadcasting subtracts local_affinity[j] ** gamma from column j.
    sim_lcent = sim - (local_affinity ** gamma)
    return sim_lcent